An Exception was encountered at 'In [7]'.
This notebook is intended as a generic notebook to be used with the papermill Python library to allow automated generation of analyses and reports for classifiers on microbiome data generated by the kraken2 pipeline
cd /project/src
[Errno 2] No such file or directory: '/project/src' /project/6011811/data/microbiome_OJS/workflow
from sklearn import model_selection
from sklearn import metrics
import os
import re
import copy
import numpy as np
import pandas as pd
import sys
sys.path.insert(0, '/project/6011811/data/microbiome_OJS/workflow/src/')
from MicroBiome import MicroBiomeDataSet, Trainer, TrainTester, MultiTrainTester, list_transformer, DiffExpTransform
from ScoreFunctions import *
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn import linear_model as LM
import seaborn as sns
import pickle as pk
from matplotlib import pyplot as plt
# Ignore warning messages
if True:
import warnings
warnings.filterwarnings('ignore')
# Default parameters (papermill "parameters" cell); the injected cell below
# overrides these when the notebook is executed through papermill.
input_dir = '/project/data/preprocessed/PE_50K_sex_complete'
output_dir = '/project/results/LR_Classifier_DE_w_clinical_generic'
retrain = True  # False = load previously pickled results instead of refitting
fdr_DE = 0.05  # FDR cutoff for differential-expression feature selection
# Parameters
# (papermill-injected values for this run)
input_dir = "results/kraken2_PE_500/notebooks/PE_500_Sex/prepped_data"
output_dir = "results/kraken2_PE_500/notebooks/PE_500_Sex/LR_DE_clinical"
retrain = True
fdr_DE = 0.05
# Sanity check: list the prepped-data directory (expects X.pk, y.pk,
# meta_data_mat.pk plus metadata CSVs)
os.listdir(input_dir)
['meta_data_mat.pk', 'metadata_samples_keep.csv', 'y.pk', 'feat_meta.csv', 'X.pk']
# Load the prepped model inputs. NOTE: pickle.load executes arbitrary code on
# load — only read files produced by the trusted data-prep notebook.
# Context managers replace manual open/close pairs (no leaked handles on error).
with open(os.path.join(input_dir, 'X.pk'), 'rb') as infile_X:
    X = pk.load(infile_X)  # microbiome feature matrix (samples x taxa — TODO confirm)
with open(os.path.join(input_dir, 'y.pk'), 'rb') as infile_y:
    y = pk.load(infile_y)  # class labels
with open(os.path.join(input_dir, 'meta_data_mat.pk'), 'rb') as infile_meta_data_mat:
    meta_data_mat = pk.load(infile_meta_data_mat)  # clinical covariate matrix
# model input: clinical covariates appended after the microbiome features
X_inp = np.concatenate([X, meta_data_mat], axis=1)
Execution using papermill encountered an exception here and stopped:
n_splits = 10  # number of outer train/test splits
out_path = os.path.join(output_dir, 'MyMultiTrainTester.pk')
if retrain:
    # clear previous results, if any
    import shutil
    if os.path.exists(output_dir):
        # shutil.rmtree is portable and safe for paths containing shell
        # metacharacters, unlike os.system('rm -rf ' + output_dir)
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)
    # we run PCA on microbiome data and leave clinical data intact
    MyLogistic = LM.LogisticRegression(random_state=42, class_weight='balanced',
                                       penalty='elasticnet', solver='saga', l1_ratio=0.5)
    # NOTE(review): DiffExpTransform may select fewer features than the PCA
    # n_components requested in param_grid below; when that happens every
    # grid-search candidate fails to fit ("n_components must be between 0 and
    # min(n_samples, n_features)", seen in the low-depth runs). Consider
    # capping n_components by the number of selected features.
    MatrixPipeLine = Pipeline([('DE0', DiffExpTransform(fdr=fdr_DE)), ('scaler0', StandardScaler()), ('pca', PCA())])
    # microbiome columns go through DE -> scale -> PCA; the trailing clinical
    # column is passed through untouched. NOTE(review): only the single column
    # at index X.shape[1] is passed through — confirm meta_data_mat has exactly
    # one column, otherwise the remaining covariates are silently dropped.
    MyPipeLine = ColumnTransformer([('MatrixPipeLine', MatrixPipeLine, np.arange(0, X.shape[1])),
                                    ('MetaData', 'passthrough', [X.shape[1]])])
    clf = Pipeline([('FullData', MyPipeLine), ('LR', MyLogistic)])
    param_grid = dict(LR__C=np.exp(-np.arange(-10, 10)),
                      FullData__MatrixPipeLine__pca__n_components=np.arange(3, 4))
    model = model_selection.GridSearchCV(clf, param_grid, scoring=metrics.make_scorer(metrics.f1_score), cv=5)
    # Trainer
    MyTrainer = Trainer(model=model)
    # random seed used in class definition is not used in final output models
    MyTrainTester = TrainTester(MyTrainer, metrics.f1_score)
    # note that random seed here affects sequence of seeds passed to making new TrainTester objects
    # using LRTrainTester as template. Thus, you have all settings but seed affecting sample split
    # across all data splits
    MyMultiTrainTester = MultiTrainTester(MyTrainTester, numpy_rand_seed=42, n_splits=n_splits)
    MyMultiTrainTester.train(X_inp, y)
    # save results — context manager guarantees the handle is closed
    with open(out_path, 'wb') as outfile:
        pk.dump(MyMultiTrainTester, outfile)
else:
    # load previous results
    with open(out_path, 'rb') as infile:
        MyMultiTrainTester = pk.load(infile)
Running for split 1 of 10
--------------------------------------------------------------------------- NotFittedError Traceback (most recent call last) /tmp/ipykernel_12071/421879363.py in <module> 25 # across all data splits 26 MyMultiTrainTester = MultiTrainTester(MyTrainTester, numpy_rand_seed=42, n_splits=n_splits) ---> 27 MyMultiTrainTester.train(X_inp, y) 28 # save results 29 outfile = open(out_path,'wb') ~/src/MicroBiome.py in train(self, X, y, **args) 628 TrainTesterCopy = copy.deepcopy(self.template) 629 TrainTesterCopy.rand_state = seed_sequence[i] --> 630 TrainTesterCopy.train(X, y, **args) 631 self.train_scores.append(TrainTesterCopy.train_score) 632 self.test_scores.append(TrainTesterCopy.test_score) ~/src/MicroBiome.py in train(self, X, y, do_validation, **args) 535 self.history = history 536 else: --> 537 self.Trainer.fit(X_train, y_train, **args) 538 539 ~/src/MicroBiome.py in fit(self, X, y, epochs, class_weight, batch_size, validation_data) 428 429 if validation_data is None: --> 430 self.model.fit(X, y) 431 else: 432 self.model.fit(X, y, epochs=epochs, class_weight=class_weight, batch_size=batch_size, validation_data=validation_data) /src/anaconda/envs/microbiome/lib/python3.7/site-packages/sklearn/utils/validation.py in inner_f(*args, **kwargs) 61 extra_args = len(args) - len(all_args) 62 if extra_args <= 0: ---> 63 return f(*args, **kwargs) 64 65 # extra_args > 0 /src/anaconda/envs/microbiome/lib/python3.7/site-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups, **fit_params) 839 return results 840 --> 841 self._run_search(evaluate_candidates) 842 843 # multimetric is determined here because in the case of a callable /src/anaconda/envs/microbiome/lib/python3.7/site-packages/sklearn/model_selection/_search.py in _run_search(self, evaluate_candidates) 1294 def _run_search(self, evaluate_candidates): 1295 """Search all candidates in param_grid""" -> 1296 evaluate_candidates(ParameterGrid(self.param_grid)) 1297 1298 
/src/anaconda/envs/microbiome/lib/python3.7/site-packages/sklearn/model_selection/_search.py in evaluate_candidates(candidate_params, cv, more_results) 825 # of out will be done in `_insert_error_scores`. 826 if callable(self.scoring): --> 827 _insert_error_scores(out, self.error_score) 828 all_candidate_params.extend(candidate_params) 829 all_out.extend(out) /src/anaconda/envs/microbiome/lib/python3.7/site-packages/sklearn/model_selection/_validation.py in _insert_error_scores(results, error_score) 299 300 if successful_score is None: --> 301 raise NotFittedError("All estimators failed to fit") 302 303 if isinstance(successful_score, dict): NotFittedError: All estimators failed to fit
# Long-format frame of per-split train/test F1 scores. pd.concat replaces
# DataFrame.append, which is deprecated and removed in pandas 2.0; the
# duplicated 0..n_splits-1 index of the original append is preserved.
scores_df = pd.concat([
    pd.DataFrame({'score': MyMultiTrainTester.train_scores,
                  'stage': np.repeat('train', n_splits)}),
    pd.DataFrame({'score': MyMultiTrainTester.test_scores,
                  'stage': np.repeat('test', n_splits)}),
])
scores_df
# train vs. test score distribution across the outer splits
sns.boxplot(data = scores_df, x = 'stage', y = 'score')
# Extract, for each outer split, the hyperparameters chosen by the inner
# 5-fold grid search and the DE-selected feature mask.
feats_in_split = []
hyperparams = {'C': [], 'n_feats': [], 'n_components': []}
for i in range(n_splits):
    # inverse regularization strength picked for the logistic regression
    hyperparams['C'].append(MyMultiTrainTester.TrainerList[i].model.best_params_['LR__C'])
    # feature mask from the fitted DE step (deep copy so it is independent of
    # the fitted pipeline object); presumably a boolean/0-1 array, since np.sum
    # below is used as a feature count — TODO confirm
    selected_feats_i = copy.deepcopy(MyMultiTrainTester.TrainerList[i].model.best_estimator_['FullData'].transformers_[0][1]['DE0'].selected_feats)
    feats_in_split.append(selected_feats_i)
    hyperparams['n_feats'].append(np.sum(selected_feats_i))
    # PCA dimensionality chosen by the search (grid currently has one value)
    hyperparams['n_components'].append(MyMultiTrainTester.TrainerList[i].model.best_params_['FullData__MatrixPipeLine__pca__n_components'])
hyperparams_df = pd.DataFrame(hyperparams)
hyperparams_df
# Full per-split metric table (long format with 'score_type'/'value' columns,
# per the usage below), saved alongside the trained models and summarized as a boxplot.
scoring_metrics = MyMultiTrainTester.getScores()
scoring_metrics_df = pd.DataFrame(scoring_metrics)
scoring_metrics_df.to_csv(os.path.join(output_dir, 'scoring_metrics.csv'))
sns.set(rc={"figure.figsize":(12, 8)})
sns.boxplot(data = scoring_metrics_df, x = 'score_type', y = 'value')
# Per-metric summary statistics across the outer splits.
score_names = np.unique(scoring_metrics['score_type'])
# groupby replaces the manual accumulation loop; ddof=0 reproduces np.std
# (population SD), and groupby's sorted keys match the np.unique ordering.
score_stats_df = (
    scoring_metrics_df
    .groupby('score_type')['value']
    .agg(mean='mean', median='median', std_dev=lambda v: v.std(ddof=0))
    .reset_index()
    .rename(columns={'score_type': 'score'})
)
score_stats_df
# Fix: write next to scoring_metrics.csv in output_dir; the old bare relative
# path dropped the file in the CWD, where parallel papermill runs overwrite it.
score_stats_df.to_csv(os.path.join(output_dir, 'score_stats.csv'))
# Diagnostic plots produced by the MultiTrainTester across all splits:
# confusion matrices (normalized and raw), class frequencies, and
# precision-recall curves.
MyMultiTrainTester.plot_confusion(normalize=True, figsize=(15,25))
MyMultiTrainTester.plot_confusion(normalize=False, figsize=(15,25))
MyMultiTrainTester.plot_class_freq(normalize=True, figsize=(15,35))
MyMultiTrainTester.plot_precrecall(figsize=(15,35))
An Exception was encountered at 'In [7]'.
This notebook is intended as a generic notebook to be used with the papermill Python library to allow automated generation of analyses and reports for classifiers on microbiome data generated by the kraken2 pipeline
cd /project/src
[Errno 2] No such file or directory: '/project/src' /project/6011811/data/microbiome_OJS/workflow
from sklearn import model_selection
from sklearn import metrics
import os
import re
import copy
import numpy as np
import pandas as pd
import sys
sys.path.insert(0, '/project/6011811/data/microbiome_OJS/workflow/src/')
from MicroBiome import MicroBiomeDataSet, Trainer, TrainTester, MultiTrainTester, list_transformer, DiffExpTransform
from ScoreFunctions import *
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn import linear_model as LM
import seaborn as sns
import pickle as pk
from matplotlib import pyplot as plt
# Ignore warning messages
if True:
import warnings
warnings.filterwarnings('ignore')
# Default parameters (papermill "parameters" cell); the injected cell below
# overrides these when the notebook is executed through papermill.
input_dir = '/project/data/preprocessed/PE_50K_sex_complete'
output_dir = '/project/results/LR_Classifier_DE_w_clinical_generic'
retrain = True  # False = load previously pickled results instead of refitting
fdr_DE = 0.05  # FDR cutoff for differential-expression feature selection
# Parameters
# (papermill-injected values for this run)
input_dir = "results/kraken2_PE_1K/notebooks/PE_1K_Sex/prepped_data"
output_dir = "results/kraken2_PE_1K/notebooks/PE_1K_Sex/LR_DE_clinical"
retrain = True
fdr_DE = 0.05
# Sanity check: list the prepped-data directory (expects X.pk, y.pk,
# meta_data_mat.pk plus metadata CSVs)
os.listdir(input_dir)
['meta_data_mat.pk', 'metadata_samples_keep.csv', 'y.pk', 'feat_meta.csv', 'X.pk']
# Load the prepped model inputs. NOTE: pickle.load executes arbitrary code on
# load — only read files produced by the trusted data-prep notebook.
# Context managers replace manual open/close pairs (no leaked handles on error).
with open(os.path.join(input_dir, 'X.pk'), 'rb') as infile_X:
    X = pk.load(infile_X)  # microbiome feature matrix (samples x taxa — TODO confirm)
with open(os.path.join(input_dir, 'y.pk'), 'rb') as infile_y:
    y = pk.load(infile_y)  # class labels
with open(os.path.join(input_dir, 'meta_data_mat.pk'), 'rb') as infile_meta_data_mat:
    meta_data_mat = pk.load(infile_meta_data_mat)  # clinical covariate matrix
# model input: clinical covariates appended after the microbiome features
X_inp = np.concatenate([X, meta_data_mat], axis=1)
Execution using papermill encountered an exception here and stopped:
n_splits = 10  # number of outer train/test splits
out_path = os.path.join(output_dir, 'MyMultiTrainTester.pk')
if retrain:
    # clear previous results, if any
    import shutil
    if os.path.exists(output_dir):
        # shutil.rmtree is portable and safe for paths containing shell
        # metacharacters, unlike os.system('rm -rf ' + output_dir)
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)
    # we run PCA on microbiome data and leave clinical data intact
    MyLogistic = LM.LogisticRegression(random_state=42, class_weight='balanced',
                                       penalty='elasticnet', solver='saga', l1_ratio=0.5)
    # NOTE(review): DiffExpTransform may select fewer features than the PCA
    # n_components requested in param_grid below; when that happens every
    # grid-search candidate fails to fit ("n_components must be between 0 and
    # min(n_samples, n_features)", seen in the low-depth runs). Consider
    # capping n_components by the number of selected features.
    MatrixPipeLine = Pipeline([('DE0', DiffExpTransform(fdr=fdr_DE)), ('scaler0', StandardScaler()), ('pca', PCA())])
    # microbiome columns go through DE -> scale -> PCA; the trailing clinical
    # column is passed through untouched. NOTE(review): only the single column
    # at index X.shape[1] is passed through — confirm meta_data_mat has exactly
    # one column, otherwise the remaining covariates are silently dropped.
    MyPipeLine = ColumnTransformer([('MatrixPipeLine', MatrixPipeLine, np.arange(0, X.shape[1])),
                                    ('MetaData', 'passthrough', [X.shape[1]])])
    clf = Pipeline([('FullData', MyPipeLine), ('LR', MyLogistic)])
    param_grid = dict(LR__C=np.exp(-np.arange(-10, 10)),
                      FullData__MatrixPipeLine__pca__n_components=np.arange(3, 4))
    model = model_selection.GridSearchCV(clf, param_grid, scoring=metrics.make_scorer(metrics.f1_score), cv=5)
    # Trainer
    MyTrainer = Trainer(model=model)
    # random seed used in class definition is not used in final output models
    MyTrainTester = TrainTester(MyTrainer, metrics.f1_score)
    # note that random seed here affects sequence of seeds passed to making new TrainTester objects
    # using LRTrainTester as template. Thus, you have all settings but seed affecting sample split
    # across all data splits
    MyMultiTrainTester = MultiTrainTester(MyTrainTester, numpy_rand_seed=42, n_splits=n_splits)
    MyMultiTrainTester.train(X_inp, y)
    # save results — context manager guarantees the handle is closed
    with open(out_path, 'wb') as outfile:
        pk.dump(MyMultiTrainTester, outfile)
else:
    # load previous results
    with open(out_path, 'rb') as infile:
        MyMultiTrainTester = pk.load(infile)
Running for split 1 of 10
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) /tmp/ipykernel_9130/421879363.py in <module> 25 # across all data splits 26 MyMultiTrainTester = MultiTrainTester(MyTrainTester, numpy_rand_seed=42, n_splits=n_splits) ---> 27 MyMultiTrainTester.train(X_inp, y) 28 # save results 29 outfile = open(out_path,'wb') ~/src/MicroBiome.py in train(self, X, y, **args) 628 TrainTesterCopy = copy.deepcopy(self.template) 629 TrainTesterCopy.rand_state = seed_sequence[i] --> 630 TrainTesterCopy.train(X, y, **args) 631 self.train_scores.append(TrainTesterCopy.train_score) 632 self.test_scores.append(TrainTesterCopy.test_score) ~/src/MicroBiome.py in train(self, X, y, do_validation, **args) 535 self.history = history 536 else: --> 537 self.Trainer.fit(X_train, y_train, **args) 538 539 ~/src/MicroBiome.py in fit(self, X, y, epochs, class_weight, batch_size, validation_data) 428 429 if validation_data is None: --> 430 self.model.fit(X, y) 431 else: 432 self.model.fit(X, y, epochs=epochs, class_weight=class_weight, batch_size=batch_size, validation_data=validation_data) /src/anaconda/envs/microbiome/lib/python3.7/site-packages/sklearn/utils/validation.py in inner_f(*args, **kwargs) 61 extra_args = len(args) - len(all_args) 62 if extra_args <= 0: ---> 63 return f(*args, **kwargs) 64 65 # extra_args > 0 /src/anaconda/envs/microbiome/lib/python3.7/site-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups, **fit_params) 878 refit_start_time = time.time() 879 if y is not None: --> 880 self.best_estimator_.fit(X, y, **fit_params) 881 else: 882 self.best_estimator_.fit(X, **fit_params) /src/anaconda/envs/microbiome/lib/python3.7/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params) 339 """ 340 fit_params_steps = self._check_fit_params(**fit_params) --> 341 Xt = self._fit(X, y, **fit_params_steps) 342 with _print_elapsed_time('Pipeline', 343 self._log_message(len(self.steps) - 1)): 
/src/anaconda/envs/microbiome/lib/python3.7/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params_steps) 305 message_clsname='Pipeline', 306 message=self._log_message(step_idx), --> 307 **fit_params_steps[name]) 308 # Replace the transformer of the step with the fitted 309 # transformer. This is necessary when loading the transformer /src/anaconda/envs/microbiome/lib/python3.7/site-packages/joblib/memory.py in __call__(self, *args, **kwargs) 350 351 def __call__(self, *args, **kwargs): --> 352 return self.func(*args, **kwargs) 353 354 def call_and_shelve(self, *args, **kwargs): /src/anaconda/envs/microbiome/lib/python3.7/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params) 752 with _print_elapsed_time(message_clsname, message): 753 if hasattr(transformer, 'fit_transform'): --> 754 res = transformer.fit_transform(X, y, **fit_params) 755 else: 756 res = transformer.fit(X, y, **fit_params).transform(X) /src/anaconda/envs/microbiome/lib/python3.7/site-packages/sklearn/compose/_column_transformer.py in fit_transform(self, X, y) 505 self._validate_remainder(X) 506 --> 507 result = self._fit_transform(X, y, _fit_transform_one) 508 509 if not result: /src/anaconda/envs/microbiome/lib/python3.7/site-packages/sklearn/compose/_column_transformer.py in _fit_transform(self, X, y, func, fitted) 441 message=self._log_message(name, idx, len(transformers))) 442 for idx, (name, trans, column, weight) in enumerate( --> 443 self._iter(fitted=fitted, replace_strings=True), 1)) 444 except ValueError as e: 445 if "Expected 2D array, got 1D array instead" in str(e): /src/anaconda/envs/microbiome/lib/python3.7/site-packages/joblib/parallel.py in __call__(self, iterable) 1039 # remaining jobs. 
1040 self._iterating = False -> 1041 if self.dispatch_one_batch(iterator): 1042 self._iterating = self._original_iterator is not None 1043 /src/anaconda/envs/microbiome/lib/python3.7/site-packages/joblib/parallel.py in dispatch_one_batch(self, iterator) 857 return False 858 else: --> 859 self._dispatch(tasks) 860 return True 861 /src/anaconda/envs/microbiome/lib/python3.7/site-packages/joblib/parallel.py in _dispatch(self, batch) 775 with self._lock: 776 job_idx = len(self._jobs) --> 777 job = self._backend.apply_async(batch, callback=cb) 778 # A job can complete so quickly than its callback is 779 # called before we get here, causing self._jobs to /src/anaconda/envs/microbiome/lib/python3.7/site-packages/joblib/_parallel_backends.py in apply_async(self, func, callback) 206 def apply_async(self, func, callback=None): 207 """Schedule a func to be run""" --> 208 result = ImmediateResult(func) 209 if callback: 210 callback(result) /src/anaconda/envs/microbiome/lib/python3.7/site-packages/joblib/_parallel_backends.py in __init__(self, batch) 570 # Don't delay the application, to avoid keeping the input 571 # arguments in memory --> 572 self.results = batch() 573 574 def get(self): /src/anaconda/envs/microbiome/lib/python3.7/site-packages/joblib/parallel.py in __call__(self) 261 with parallel_backend(self._backend, n_jobs=self._n_jobs): 262 return [func(*args, **kwargs) --> 263 for func, args, kwargs in self.items] 264 265 def __reduce__(self): /src/anaconda/envs/microbiome/lib/python3.7/site-packages/joblib/parallel.py in <listcomp>(.0) 261 with parallel_backend(self._backend, n_jobs=self._n_jobs): 262 return [func(*args, **kwargs) --> 263 for func, args, kwargs in self.items] 264 265 def __reduce__(self): /src/anaconda/envs/microbiome/lib/python3.7/site-packages/sklearn/utils/fixes.py in __call__(self, *args, **kwargs) 220 def __call__(self, *args, **kwargs): 221 with config_context(**self.config): --> 222 return self.function(*args, **kwargs) 
/src/anaconda/envs/microbiome/lib/python3.7/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params) 752 with _print_elapsed_time(message_clsname, message): 753 if hasattr(transformer, 'fit_transform'): --> 754 res = transformer.fit_transform(X, y, **fit_params) 755 else: 756 res = transformer.fit(X, y, **fit_params).transform(X) /src/anaconda/envs/microbiome/lib/python3.7/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params) 385 fit_params_last_step = fit_params_steps[self.steps[-1][0]] 386 if hasattr(last_step, 'fit_transform'): --> 387 return last_step.fit_transform(Xt, y, **fit_params_last_step) 388 else: 389 return last_step.fit(Xt, y, /src/anaconda/envs/microbiome/lib/python3.7/site-packages/sklearn/decomposition/_pca.py in fit_transform(self, X, y) 381 C-ordered array, use 'np.ascontiguousarray'. 382 """ --> 383 U, S, Vt = self._fit(X) 384 U = U[:, :self.n_components_] 385 /src/anaconda/envs/microbiome/lib/python3.7/site-packages/sklearn/decomposition/_pca.py in _fit(self, X) 428 # Call different fits for either full or truncated SVD 429 if self._fit_svd_solver == 'full': --> 430 return self._fit_full(X, n_components) 431 elif self._fit_svd_solver in ['arpack', 'randomized']: 432 return self._fit_truncated(X, n_components, self._fit_svd_solver) /src/anaconda/envs/microbiome/lib/python3.7/site-packages/sklearn/decomposition/_pca.py in _fit_full(self, X, n_components) 447 "min(n_samples, n_features)=%r with " 448 "svd_solver='full'" --> 449 % (n_components, min(n_samples, n_features))) 450 elif n_components >= 1: 451 if not isinstance(n_components, numbers.Integral): ValueError: n_components=3 must be between 0 and min(n_samples, n_features)=1 with svd_solver='full'
# Long-format frame of per-split train/test F1 scores. pd.concat replaces
# DataFrame.append, which is deprecated and removed in pandas 2.0; the
# duplicated 0..n_splits-1 index of the original append is preserved.
scores_df = pd.concat([
    pd.DataFrame({'score': MyMultiTrainTester.train_scores,
                  'stage': np.repeat('train', n_splits)}),
    pd.DataFrame({'score': MyMultiTrainTester.test_scores,
                  'stage': np.repeat('test', n_splits)}),
])
scores_df
# train vs. test score distribution across the outer splits
sns.boxplot(data = scores_df, x = 'stage', y = 'score')
# Extract, for each outer split, the hyperparameters chosen by the inner
# 5-fold grid search and the DE-selected feature mask.
feats_in_split = []
hyperparams = {'C': [], 'n_feats': [], 'n_components': []}
for i in range(n_splits):
    # inverse regularization strength picked for the logistic regression
    hyperparams['C'].append(MyMultiTrainTester.TrainerList[i].model.best_params_['LR__C'])
    # feature mask from the fitted DE step (deep copy so it is independent of
    # the fitted pipeline object); presumably a boolean/0-1 array, since np.sum
    # below is used as a feature count — TODO confirm
    selected_feats_i = copy.deepcopy(MyMultiTrainTester.TrainerList[i].model.best_estimator_['FullData'].transformers_[0][1]['DE0'].selected_feats)
    feats_in_split.append(selected_feats_i)
    hyperparams['n_feats'].append(np.sum(selected_feats_i))
    # PCA dimensionality chosen by the search (grid currently has one value)
    hyperparams['n_components'].append(MyMultiTrainTester.TrainerList[i].model.best_params_['FullData__MatrixPipeLine__pca__n_components'])
hyperparams_df = pd.DataFrame(hyperparams)
hyperparams_df
# Full per-split metric table (long format with 'score_type'/'value' columns,
# per the usage below), saved alongside the trained models and summarized as a boxplot.
scoring_metrics = MyMultiTrainTester.getScores()
scoring_metrics_df = pd.DataFrame(scoring_metrics)
scoring_metrics_df.to_csv(os.path.join(output_dir, 'scoring_metrics.csv'))
sns.set(rc={"figure.figsize":(12, 8)})
sns.boxplot(data = scoring_metrics_df, x = 'score_type', y = 'value')
# Per-metric summary statistics across the outer splits.
score_names = np.unique(scoring_metrics['score_type'])
# groupby replaces the manual accumulation loop; ddof=0 reproduces np.std
# (population SD), and groupby's sorted keys match the np.unique ordering.
score_stats_df = (
    scoring_metrics_df
    .groupby('score_type')['value']
    .agg(mean='mean', median='median', std_dev=lambda v: v.std(ddof=0))
    .reset_index()
    .rename(columns={'score_type': 'score'})
)
score_stats_df
# Fix: write next to scoring_metrics.csv in output_dir; the old bare relative
# path dropped the file in the CWD, where parallel papermill runs overwrite it.
score_stats_df.to_csv(os.path.join(output_dir, 'score_stats.csv'))
# Diagnostic plots produced by the MultiTrainTester across all splits:
# confusion matrices (normalized and raw), class frequencies, and
# precision-recall curves.
MyMultiTrainTester.plot_confusion(normalize=True, figsize=(15,25))
MyMultiTrainTester.plot_confusion(normalize=False, figsize=(15,25))
MyMultiTrainTester.plot_class_freq(normalize=True, figsize=(15,35))
MyMultiTrainTester.plot_precrecall(figsize=(15,35))
This notebook is intended as a generic notebook to be used with the papermill Python library to allow automated generation of analyses and reports for classifiers on microbiome data generated by the kraken2 pipeline
cd /project/src
[Errno 2] No such file or directory: '/project/src' /project/6011811/data/microbiome_OJS/workflow
from sklearn import model_selection
from sklearn import metrics
import os
import re
import copy
import numpy as np
import pandas as pd
import sys
sys.path.insert(0, '/project/6011811/data/microbiome_OJS/workflow/src/')
from MicroBiome import MicroBiomeDataSet, Trainer, TrainTester, MultiTrainTester, list_transformer, DiffExpTransform
from ScoreFunctions import *
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn import linear_model as LM
import seaborn as sns
import pickle as pk
from matplotlib import pyplot as plt
# Ignore warning messages
if True:
import warnings
warnings.filterwarnings('ignore')
# Default parameters (papermill "parameters" cell); the injected cell below
# overrides these when the notebook is executed through papermill.
input_dir = '/project/data/preprocessed/PE_50K_sex_complete'
output_dir = '/project/results/LR_Classifier_DE_w_clinical_generic'
retrain = True  # False = load previously pickled results instead of refitting
fdr_DE = 0.05  # FDR cutoff for differential-expression feature selection
# Parameters
# (papermill-injected values for this run)
input_dir = "results/kraken2_PE_5K/notebooks/PE_5K_Sex/prepped_data"
output_dir = "results/kraken2_PE_5K/notebooks/PE_5K_Sex/LR_DE_clinical"
retrain = True
fdr_DE = 0.05
# Sanity check: list the prepped-data directory (expects X.pk, y.pk,
# meta_data_mat.pk plus metadata CSVs)
os.listdir(input_dir)
['meta_data_mat.pk', 'metadata_samples_keep.csv', 'y.pk', 'feat_meta.csv', 'X.pk']
# Load the prepped model inputs. NOTE: pickle.load executes arbitrary code on
# load — only read files produced by the trusted data-prep notebook.
# Context managers replace manual open/close pairs (no leaked handles on error).
with open(os.path.join(input_dir, 'X.pk'), 'rb') as infile_X:
    X = pk.load(infile_X)  # microbiome feature matrix (samples x taxa — TODO confirm)
with open(os.path.join(input_dir, 'y.pk'), 'rb') as infile_y:
    y = pk.load(infile_y)  # class labels
with open(os.path.join(input_dir, 'meta_data_mat.pk'), 'rb') as infile_meta_data_mat:
    meta_data_mat = pk.load(infile_meta_data_mat)  # clinical covariate matrix
# model input: clinical covariates appended after the microbiome features
X_inp = np.concatenate([X, meta_data_mat], axis=1)
n_splits = 10  # number of outer train/test splits
out_path = os.path.join(output_dir, 'MyMultiTrainTester.pk')
if retrain:
    # clear previous results, if any
    import shutil
    if os.path.exists(output_dir):
        # shutil.rmtree is portable and safe for paths containing shell
        # metacharacters, unlike os.system('rm -rf ' + output_dir)
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)
    # we run PCA on microbiome data and leave clinical data intact
    MyLogistic = LM.LogisticRegression(random_state=42, class_weight='balanced',
                                       penalty='elasticnet', solver='saga', l1_ratio=0.5)
    # NOTE(review): DiffExpTransform may select fewer features than the PCA
    # n_components requested in param_grid below; when that happens every
    # grid-search candidate fails to fit ("n_components must be between 0 and
    # min(n_samples, n_features)", seen in the low-depth runs). Consider
    # capping n_components by the number of selected features.
    MatrixPipeLine = Pipeline([('DE0', DiffExpTransform(fdr=fdr_DE)), ('scaler0', StandardScaler()), ('pca', PCA())])
    # microbiome columns go through DE -> scale -> PCA; the trailing clinical
    # column is passed through untouched. NOTE(review): only the single column
    # at index X.shape[1] is passed through — confirm meta_data_mat has exactly
    # one column, otherwise the remaining covariates are silently dropped.
    MyPipeLine = ColumnTransformer([('MatrixPipeLine', MatrixPipeLine, np.arange(0, X.shape[1])),
                                    ('MetaData', 'passthrough', [X.shape[1]])])
    clf = Pipeline([('FullData', MyPipeLine), ('LR', MyLogistic)])
    param_grid = dict(LR__C=np.exp(-np.arange(-10, 10)),
                      FullData__MatrixPipeLine__pca__n_components=np.arange(3, 4))
    model = model_selection.GridSearchCV(clf, param_grid, scoring=metrics.make_scorer(metrics.f1_score), cv=5)
    # Trainer
    MyTrainer = Trainer(model=model)
    # random seed used in class definition is not used in final output models
    MyTrainTester = TrainTester(MyTrainer, metrics.f1_score)
    # note that random seed here affects sequence of seeds passed to making new TrainTester objects
    # using LRTrainTester as template. Thus, you have all settings but seed affecting sample split
    # across all data splits
    MyMultiTrainTester = MultiTrainTester(MyTrainTester, numpy_rand_seed=42, n_splits=n_splits)
    MyMultiTrainTester.train(X_inp, y)
    # save results — context manager guarantees the handle is closed
    with open(out_path, 'wb') as outfile:
        pk.dump(MyMultiTrainTester, outfile)
else:
    # load previous results
    with open(out_path, 'rb') as infile:
        MyMultiTrainTester = pk.load(infile)
Running for split 1 of 10 Using predict_proba getting predictions from probs Running for split 2 of 10 Using predict_proba getting predictions from probs Running for split 3 of 10 Using predict_proba getting predictions from probs Running for split 4 of 10 Using predict_proba getting predictions from probs Running for split 5 of 10 Using predict_proba getting predictions from probs Running for split 6 of 10 Using predict_proba getting predictions from probs Running for split 7 of 10 Using predict_proba getting predictions from probs Running for split 8 of 10 Using predict_proba getting predictions from probs Running for split 9 of 10 Using predict_proba getting predictions from probs Running for split 10 of 10 Using predict_proba getting predictions from probs
# Long-format frame of per-split train/test F1 scores. pd.concat replaces
# DataFrame.append, which is deprecated and removed in pandas 2.0; the
# duplicated 0..n_splits-1 index of the original append is preserved.
scores_df = pd.concat([
    pd.DataFrame({'score': MyMultiTrainTester.train_scores,
                  'stage': np.repeat('train', n_splits)}),
    pd.DataFrame({'score': MyMultiTrainTester.test_scores,
                  'stage': np.repeat('test', n_splits)}),
])
scores_df
| score | stage | |
|---|---|---|
| 0 | 0.720238 | train |
| 1 | 0.774775 | train |
| 2 | 0.772861 | train |
| 3 | 0.744745 | train |
| 4 | 0.745342 | train |
| 5 | 0.745342 | train |
| 6 | 0.791541 | train |
| 7 | 0.710098 | train |
| 8 | 0.730061 | train |
| 9 | 0.761610 | train |
| 0 | 0.808989 | test |
| 1 | 0.717949 | test |
| 2 | 0.597015 | test |
| 3 | 0.774194 | test |
| 4 | 0.747253 | test |
| 5 | 0.747253 | test |
| 6 | 0.683544 | test |
| 7 | 0.800000 | test |
| 8 | 0.795455 | test |
| 9 | 0.776471 | test |
# train vs. test score distribution across the outer splits
sns.boxplot(data = scores_df, x = 'stage', y = 'score')
<AxesSubplot:xlabel='stage', ylabel='score'>
# Extract, for each outer split, the hyperparameters chosen by the inner
# 5-fold grid search and the DE-selected feature mask.
feats_in_split = []
hyperparams = {'C': [], 'n_feats': [], 'n_components': []}
for i in range(n_splits):
    # inverse regularization strength picked for the logistic regression
    hyperparams['C'].append(MyMultiTrainTester.TrainerList[i].model.best_params_['LR__C'])
    # feature mask from the fitted DE step (deep copy so it is independent of
    # the fitted pipeline object); presumably a boolean/0-1 array, since np.sum
    # below is used as a feature count — TODO confirm
    selected_feats_i = copy.deepcopy(MyMultiTrainTester.TrainerList[i].model.best_estimator_['FullData'].transformers_[0][1]['DE0'].selected_feats)
    feats_in_split.append(selected_feats_i)
    hyperparams['n_feats'].append(np.sum(selected_feats_i))
    # PCA dimensionality chosen by the search (grid currently has one value)
    hyperparams['n_components'].append(MyMultiTrainTester.TrainerList[i].model.best_params_['FullData__MatrixPipeLine__pca__n_components'])
hyperparams_df = pd.DataFrame(hyperparams)
hyperparams_df
| C | n_feats | n_components | |
|---|---|---|---|
| 0 | 22026.465795 | 11 | 3 |
| 1 | 22026.465795 | 12 | 3 |
| 2 | 22026.465795 | 11 | 3 |
| 3 | 22026.465795 | 12 | 3 |
| 4 | 22026.465795 | 9 | 3 |
| 5 | 22026.465795 | 9 | 3 |
| 6 | 22026.465795 | 20 | 3 |
| 7 | 22026.465795 | 4 | 3 |
| 8 | 22026.465795 | 13 | 3 |
| 9 | 22026.465795 | 18 | 3 |
# Full per-split metric table (long format with 'score_type'/'value' columns,
# per the usage below), saved alongside the trained models and summarized as a boxplot.
scoring_metrics = MyMultiTrainTester.getScores()
scoring_metrics_df = pd.DataFrame(scoring_metrics)
scoring_metrics_df.to_csv(os.path.join(output_dir, 'scoring_metrics.csv'))
sns.set(rc={"figure.figsize":(12, 8)})
sns.boxplot(data = scoring_metrics_df, x = 'score_type', y = 'value')
<AxesSubplot:xlabel='score_type', ylabel='value'>
# Per-metric summary statistics across the outer splits.
score_names = np.unique(scoring_metrics['score_type'])
# groupby replaces the manual accumulation loop; ddof=0 reproduces np.std
# (population SD), and groupby's sorted keys match the np.unique ordering.
score_stats_df = (
    scoring_metrics_df
    .groupby('score_type')['value']
    .agg(mean='mean', median='median', std_dev=lambda v: v.std(ddof=0))
    .reset_index()
    .rename(columns={'score_type': 'score'})
)
score_stats_df
| score | mean | median | std_dev | |
|---|---|---|---|---|
| 0 | AUPRC_NEG | 0.706184 | 0.712486 | 0.053917 |
| 1 | AUPRC_POS | 0.803110 | 0.815688 | 0.055067 |
| 2 | AUROC_NEG | 0.767862 | 0.767575 | 0.033590 |
| 3 | AUROC_POS | 0.767862 | 0.767575 | 0.033590 |
| 4 | f1_score | 0.744812 | 0.760723 | 0.061584 |
| 5 | npv_score | 0.663684 | 0.666667 | 0.043315 |
| 6 | ppv_score | 0.763018 | 0.772727 | 0.057771 |
| 7 | sensitivity | 0.729079 | 0.744681 | 0.072402 |
| 8 | specificity | 0.700998 | 0.715441 | 0.051021 |
# Fix: write next to scoring_metrics.csv in output_dir; the old bare relative
# path dropped the file in the CWD, where parallel papermill runs overwrite it.
score_stats_df.to_csv(os.path.join(output_dir, 'score_stats.csv'))
# Diagnostic plots produced by the MultiTrainTester across all splits:
# confusion matrices (normalized and raw), class frequencies, and
# precision-recall curves.
MyMultiTrainTester.plot_confusion(normalize=True, figsize=(15,25))
MyMultiTrainTester.plot_confusion(normalize=False, figsize=(15,25))
MyMultiTrainTester.plot_class_freq(normalize=True, figsize=(15,35))
MyMultiTrainTester.plot_precrecall(figsize=(15,35))
This notebook is intended as a generic notebook to be used with the papermill Python library to allow automated generation of analyses and reports for classifiers on microbiome data generated by the kraken2 pipeline
cd /project/src
[Errno 2] No such file or directory: '/project/src' /project/6011811/data/microbiome_OJS/workflow
from sklearn import model_selection
from sklearn import metrics
import os
import re
import copy
import numpy as np
import pandas as pd
import sys
sys.path.insert(0, '/project/6011811/data/microbiome_OJS/workflow/src/')
from MicroBiome import MicroBiomeDataSet, Trainer, TrainTester, MultiTrainTester, list_transformer, DiffExpTransform
from ScoreFunctions import *
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn import linear_model as LM
import seaborn as sns
import pickle as pk
from matplotlib import pyplot as plt
# Ignore warning messages (sklearn/pandas emit many benign warnings here).
# The original wrapped these two lines in a no-op `if True:` block; removed.
import warnings
warnings.filterwarnings('ignore')
# Default parameters; papermill overrides these via the injected
# "Parameters" cell that follows.
input_dir = '/project/data/preprocessed/PE_50K_sex_complete'
output_dir = '/project/results/LR_Classifier_DE_w_clinical_generic'
retrain = True  # False -> load previously pickled results instead of refitting
fdr_DE = 0.05   # FDR cutoff for the differential-expression feature filter
# Parameters
input_dir = "results/kraken2_PE_10K/notebooks/PE_10K_Sex/prepped_data"
output_dir = "results/kraken2_PE_10K/notebooks/PE_10K_Sex/LR_DE_clinical"
retrain = True
fdr_DE = 0.05
# Sanity check: list the prepped input files
os.listdir(input_dir)
['meta_data_mat.pk', 'metadata_samples_keep.csv', 'y.pk', 'feat_meta.csv', 'X.pk']
# Load prepped inputs; with-blocks guarantee the handles are closed even if
# unpickling fails (the original open/close pairs leaked on error).
# NOTE: pickle.load can execute arbitrary code — these files come from our
# own prep pipeline, so that is acceptable here.
with open(os.path.join(input_dir, 'X.pk'), 'rb') as infile_X:
    X = pk.load(infile_X)  # feature matrix (samples x features)
with open(os.path.join(input_dir, 'y.pk'), 'rb') as infile_y:
    y = pk.load(infile_y)  # target labels
with open(os.path.join(input_dir, 'meta_data_mat.pk'), 'rb') as infile_meta_data_mat:
    meta_data_mat = pk.load(infile_meta_data_mat)  # clinical covariate matrix
# model input: microbiome features followed by clinical metadata columns
X_inp = np.concatenate([X, meta_data_mat], axis=1)
n_splits = 10
out_path = os.path.join(output_dir, 'MyMultiTrainTester.pk')
if retrain:
    # clear previous results, if any. shutil.rmtree replaces the original
    # `os.system('rm -rf ' + output_dir)`, which is shell-injection-prone and
    # breaks on paths with spaces/metacharacters; os.makedirs (vs os.mkdir)
    # also creates missing parent directories.
    import shutil
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)
    # we run PCA on microbiome data and leave clinical data intact
    MyLogistic = LM.LogisticRegression(random_state=42, class_weight='balanced',
                                       penalty='elasticnet', solver='saga', l1_ratio=0.5)
    # microbiome branch: DE feature selection -> standardize -> PCA
    MatrixPipeLine = Pipeline([('DE0', DiffExpTransform(fdr=fdr_DE)), ('scaler0', StandardScaler()), ('pca', PCA())])
    # NOTE(review): only column index X.shape[1] (the first metadata column) is
    # passed through; fine while meta_data_mat carries a single informative
    # column (e.g. binary sex) — confirm if more clinical covariates are added.
    MyPipeLine = ColumnTransformer([('MatrixPipeLine', MatrixPipeLine, np.arange(0, X.shape[1])),
                                    ('MetaData', 'passthrough', [X.shape[1]])])
    clf = Pipeline([('FullData', MyPipeLine), ('LR', MyLogistic)])
    # grid: C spans e^10 down to e^-9; PCA fixed at 3 components
    param_grid = dict(LR__C=np.exp(-np.arange(-10, 10)),
                      FullData__MatrixPipeLine__pca__n_components=np.arange(3, 4))
    model = model_selection.GridSearchCV(clf, param_grid, scoring=metrics.make_scorer(metrics.f1_score), cv=5)
    # Trainer
    MyTrainer = Trainer(model=model)
    # random seed used in class definition is not used in final output models
    MyTrainTester = TrainTester(MyTrainer, metrics.f1_score)
    # note that random seed here affects sequence of seeds passed to making new TrainTester objects
    # using LRTrainTester as template. Thus, you have all settings but seed affecting sample split
    # across all data splits
    MyMultiTrainTester = MultiTrainTester(MyTrainTester, numpy_rand_seed=42, n_splits=n_splits)
    MyMultiTrainTester.train(X_inp, y)
    # save results (with-block guarantees the handle is closed on error)
    with open(out_path, 'wb') as outfile:
        pk.dump(MyMultiTrainTester, outfile)
else:
    # load previous results
    with open(out_path, 'rb') as infile:
        MyMultiTrainTester = pk.load(infile)
Running for split 1 of 10 Using predict_proba getting predictions from probs Running for split 2 of 10 Using predict_proba getting predictions from probs Running for split 3 of 10 Using predict_proba getting predictions from probs Running for split 4 of 10 Using predict_proba getting predictions from probs Running for split 5 of 10 Using predict_proba getting predictions from probs Running for split 6 of 10 Using predict_proba getting predictions from probs Running for split 7 of 10 Using predict_proba getting predictions from probs Running for split 8 of 10 Using predict_proba getting predictions from probs Running for split 9 of 10 Using predict_proba getting predictions from probs Running for split 10 of 10 Using predict_proba getting predictions from probs
# Per-split train/test F1 in one tidy frame.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4
# and removed in pandas 2.0.
scores_df = pd.concat([
    pd.DataFrame({'score': MyMultiTrainTester.train_scores, 'stage': np.repeat('train', n_splits)}),
    pd.DataFrame({'score': MyMultiTrainTester.test_scores, 'stage': np.repeat('test', n_splits)}),
])
scores_df
| score | stage | |
|---|---|---|
| 0 | 0.786164 | train |
| 1 | 0.791411 | train |
| 2 | 0.791789 | train |
| 3 | 0.750000 | train |
| 4 | 0.773585 | train |
| 5 | 0.773585 | train |
| 6 | 0.779874 | train |
| 7 | 0.748466 | train |
| 8 | 0.755287 | train |
| 9 | 0.764890 | train |
| 0 | 0.752941 | test |
| 1 | 0.714286 | test |
| 2 | 0.656716 | test |
| 3 | 0.786517 | test |
| 4 | 0.769231 | test |
| 5 | 0.769231 | test |
| 6 | 0.763158 | test |
| 7 | 0.795699 | test |
| 8 | 0.734177 | test |
| 9 | 0.819277 | test |
# Compare train vs held-out test F1 across the splits
sns.boxplot(data = scores_df, x = 'stage', y = 'score')
<AxesSubplot:xlabel='stage', ylabel='score'>
# hyperparams = {'l1_ratio': [], 'C': []}
# For each of the n_splits fitted models, record the hyperparameters chosen
# by GridSearchCV and how many taxa survived the DE feature-selection step.
feats_in_split = []
hyperparams = {'C': [], 'n_feats': [], 'n_components': []}
for i in range(n_splits):
    hyperparams['C'].append(MyMultiTrainTester.TrainerList[i].model.best_params_['LR__C'])
    # selected_feats: per-feature selection flags from the DE0 step of the
    # winning pipeline (deep-copied so we hold no references into the model);
    # presumably a boolean mask — n_feats below is its sum. TODO confirm.
    selected_feats_i = copy.deepcopy(MyMultiTrainTester.TrainerList[i].model.best_estimator_['FullData'].transformers_[0][1]['DE0'].selected_feats)
    feats_in_split.append(selected_feats_i)
    hyperparams['n_feats'].append(np.sum(selected_feats_i))
    hyperparams['n_components'].append(MyMultiTrainTester.TrainerList[i].model.best_params_['FullData__MatrixPipeLine__pca__n_components'])
hyperparams_df = pd.DataFrame(hyperparams)
hyperparams_df
| C | n_feats | n_components | |
|---|---|---|---|
| 0 | 22026.465795 | 21 | 3 |
| 1 | 22026.465795 | 31 | 3 |
| 2 | 20.085537 | 20 | 3 |
| 3 | 22026.465795 | 16 | 3 |
| 4 | 22026.465795 | 25 | 3 |
| 5 | 22026.465795 | 25 | 3 |
| 6 | 0.135335 | 29 | 3 |
| 7 | 22026.465795 | 6 | 3 |
| 8 | 22026.465795 | 18 | 3 |
| 9 | 2.718282 | 28 | 3 |
# Full panel of scoring metrics for every split; persisted for reporting
scoring_metrics = MyMultiTrainTester.getScores()
scoring_metrics_df = pd.DataFrame(scoring_metrics)
scoring_metrics_df.to_csv(os.path.join(output_dir, 'scoring_metrics.csv'))
sns.set(rc={"figure.figsize":(12, 8)})
sns.boxplot(data = scoring_metrics_df, x = 'score_type', y = 'value')
<AxesSubplot:xlabel='score_type', ylabel='value'>
# Summarize each scoring metric across the n_splits test folds
score_names = np.unique(scoring_metrics['score_type'])
score_stats = {'score': [], 'mean': [], 'median': [], 'std_dev': []}
for score in score_names:
    score_stats['score'].append(score)
    score_vect = scoring_metrics_df['value'].to_numpy()[scoring_metrics_df['score_type'] == score]
    score_stats['mean'].append(np.mean(score_vect))
    score_stats['median'].append(np.median(score_vect))
    # np.std is the population std (ddof=0), unlike pandas' default (ddof=1)
    score_stats['std_dev'].append(np.std(score_vect))
score_stats_df = pd.DataFrame(score_stats)
score_stats_df
| score | mean | median | std_dev | |
|---|---|---|---|---|
| 0 | AUPRC_NEG | 0.729456 | 0.723616 | 0.056464 |
| 1 | AUPRC_POS | 0.822583 | 0.827733 | 0.051697 |
| 2 | AUROC_NEG | 0.790356 | 0.790650 | 0.029024 |
| 3 | AUROC_POS | 0.790356 | 0.790650 | 0.029024 |
| 4 | f1_score | 0.756123 | 0.766194 | 0.043584 |
| 5 | npv_score | 0.671307 | 0.656863 | 0.049498 |
| 6 | ppv_score | 0.788067 | 0.795455 | 0.042526 |
| 7 | sensitivity | 0.728844 | 0.744135 | 0.058773 |
| 8 | specificity | 0.735925 | 0.725806 | 0.057283 |
# Write the summary table next to this run's other outputs; the original bare
# relative path landed in the CWD and was overwritten by every later run.
score_stats_df.to_csv(os.path.join(output_dir, 'score_stats.csv'))
# Diagnostic plots across splits: confusion matrices (normalized and raw
# counts), predicted class frequencies, and precision-recall curves.
MyMultiTrainTester.plot_confusion(normalize=True, figsize=(15,25))
MyMultiTrainTester.plot_confusion(normalize=False, figsize=(15,25))
MyMultiTrainTester.plot_class_freq(normalize=True, figsize=(15,35))
MyMultiTrainTester.plot_precrecall(figsize=(15,35))
This notebook is intended as a generic notebook to be used with the papermill python library to allow automated generation of analyses and reports for classifiers on microbiome data generated by the kraken2 pipeline
cd /project/src
[Errno 2] No such file or directory: '/project/src' /project/6011811/data/microbiome_OJS/workflow
from sklearn import model_selection
from sklearn import metrics
import os
import re
import copy
import numpy as np
import pandas as pd
import sys
sys.path.insert(0, '/project/6011811/data/microbiome_OJS/workflow/src/')
from MicroBiome import MicroBiomeDataSet, Trainer, TrainTester, MultiTrainTester, list_transformer, DiffExpTransform
from ScoreFunctions import *
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn import linear_model as LM
import seaborn as sns
import pickle as pk
from matplotlib import pyplot as plt
# Ignore warning messages (sklearn/pandas emit many benign warnings here).
# The original wrapped these two lines in a no-op `if True:` block; removed.
import warnings
warnings.filterwarnings('ignore')
# Default parameters; papermill overrides these via the injected
# "Parameters" cell that follows.
input_dir = '/project/data/preprocessed/PE_50K_sex_complete'
output_dir = '/project/results/LR_Classifier_DE_w_clinical_generic'
retrain = True  # False -> load previously pickled results instead of refitting
fdr_DE = 0.05   # FDR cutoff for the differential-expression feature filter
# Parameters
input_dir = "results/kraken2_PE_25K/notebooks/PE_25K_Sex/prepped_data"
output_dir = "results/kraken2_PE_25K/notebooks/PE_25K_Sex/LR_DE_clinical"
retrain = True
fdr_DE = 0.05
# Sanity check: list the prepped input files
os.listdir(input_dir)
['meta_data_mat.pk', 'metadata_samples_keep.csv', 'y.pk', 'feat_meta.csv', 'X.pk']
# Load prepped inputs; with-blocks guarantee the handles are closed even if
# unpickling fails (the original open/close pairs leaked on error).
# NOTE: pickle.load can execute arbitrary code — these files come from our
# own prep pipeline, so that is acceptable here.
with open(os.path.join(input_dir, 'X.pk'), 'rb') as infile_X:
    X = pk.load(infile_X)  # feature matrix (samples x features)
with open(os.path.join(input_dir, 'y.pk'), 'rb') as infile_y:
    y = pk.load(infile_y)  # target labels
with open(os.path.join(input_dir, 'meta_data_mat.pk'), 'rb') as infile_meta_data_mat:
    meta_data_mat = pk.load(infile_meta_data_mat)  # clinical covariate matrix
# model input: microbiome features followed by clinical metadata columns
X_inp = np.concatenate([X, meta_data_mat], axis=1)
n_splits = 10
out_path = os.path.join(output_dir, 'MyMultiTrainTester.pk')
if retrain:
    # clear previous results, if any. shutil.rmtree replaces the original
    # `os.system('rm -rf ' + output_dir)`, which is shell-injection-prone and
    # breaks on paths with spaces/metacharacters; os.makedirs (vs os.mkdir)
    # also creates missing parent directories.
    import shutil
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)
    # we run PCA on microbiome data and leave clinical data intact
    MyLogistic = LM.LogisticRegression(random_state=42, class_weight='balanced',
                                       penalty='elasticnet', solver='saga', l1_ratio=0.5)
    # microbiome branch: DE feature selection -> standardize -> PCA
    MatrixPipeLine = Pipeline([('DE0', DiffExpTransform(fdr=fdr_DE)), ('scaler0', StandardScaler()), ('pca', PCA())])
    # NOTE(review): only column index X.shape[1] (the first metadata column) is
    # passed through; fine while meta_data_mat carries a single informative
    # column (e.g. binary sex) — confirm if more clinical covariates are added.
    MyPipeLine = ColumnTransformer([('MatrixPipeLine', MatrixPipeLine, np.arange(0, X.shape[1])),
                                    ('MetaData', 'passthrough', [X.shape[1]])])
    clf = Pipeline([('FullData', MyPipeLine), ('LR', MyLogistic)])
    # grid: C spans e^10 down to e^-9; PCA fixed at 3 components
    param_grid = dict(LR__C=np.exp(-np.arange(-10, 10)),
                      FullData__MatrixPipeLine__pca__n_components=np.arange(3, 4))
    model = model_selection.GridSearchCV(clf, param_grid, scoring=metrics.make_scorer(metrics.f1_score), cv=5)
    # Trainer
    MyTrainer = Trainer(model=model)
    # random seed used in class definition is not used in final output models
    MyTrainTester = TrainTester(MyTrainer, metrics.f1_score)
    # note that random seed here affects sequence of seeds passed to making new TrainTester objects
    # using LRTrainTester as template. Thus, you have all settings but seed affecting sample split
    # across all data splits
    MyMultiTrainTester = MultiTrainTester(MyTrainTester, numpy_rand_seed=42, n_splits=n_splits)
    MyMultiTrainTester.train(X_inp, y)
    # save results (with-block guarantees the handle is closed on error)
    with open(out_path, 'wb') as outfile:
        pk.dump(MyMultiTrainTester, outfile)
else:
    # load previous results
    with open(out_path, 'rb') as infile:
        MyMultiTrainTester = pk.load(infile)
Running for split 1 of 10 Using predict_proba getting predictions from probs Running for split 2 of 10 Using predict_proba getting predictions from probs Running for split 3 of 10 Using predict_proba getting predictions from probs Running for split 4 of 10 Using predict_proba getting predictions from probs Running for split 5 of 10 Using predict_proba getting predictions from probs Running for split 6 of 10 Using predict_proba getting predictions from probs Running for split 7 of 10 Using predict_proba getting predictions from probs Running for split 8 of 10 Using predict_proba getting predictions from probs Running for split 9 of 10 Using predict_proba getting predictions from probs Running for split 10 of 10 Using predict_proba getting predictions from probs
# Per-split train/test F1 in one tidy frame.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4
# and removed in pandas 2.0.
scores_df = pd.concat([
    pd.DataFrame({'score': MyMultiTrainTester.train_scores, 'stage': np.repeat('train', n_splits)}),
    pd.DataFrame({'score': MyMultiTrainTester.test_scores, 'stage': np.repeat('test', n_splits)}),
])
scores_df
| score | stage | |
|---|---|---|
| 0 | 0.773585 | train |
| 1 | 0.790123 | train |
| 2 | 0.776471 | train |
| 3 | 0.759259 | train |
| 4 | 0.761905 | train |
| 5 | 0.761905 | train |
| 6 | 0.779874 | train |
| 7 | 0.765273 | train |
| 8 | 0.759494 | train |
| 9 | 0.750809 | train |
| 0 | 0.790698 | test |
| 1 | 0.738095 | test |
| 2 | 0.735294 | test |
| 3 | 0.786517 | test |
| 4 | 0.786517 | test |
| 5 | 0.786517 | test |
| 6 | 0.746667 | test |
| 7 | 0.835165 | test |
| 8 | 0.771084 | test |
| 9 | 0.831461 | test |
# Compare train vs held-out test F1 across the splits
sns.boxplot(data = scores_df, x = 'stage', y = 'score')
<AxesSubplot:xlabel='stage', ylabel='score'>
# hyperparams = {'l1_ratio': [], 'C': []}
# For each of the n_splits fitted models, record the hyperparameters chosen
# by GridSearchCV and how many taxa survived the DE feature-selection step.
feats_in_split = []
hyperparams = {'C': [], 'n_feats': [], 'n_components': []}
for i in range(n_splits):
    hyperparams['C'].append(MyMultiTrainTester.TrainerList[i].model.best_params_['LR__C'])
    # selected_feats: per-feature selection flags from the DE0 step of the
    # winning pipeline (deep-copied so we hold no references into the model);
    # presumably a boolean mask — n_feats below is its sum. TODO confirm.
    selected_feats_i = copy.deepcopy(MyMultiTrainTester.TrainerList[i].model.best_estimator_['FullData'].transformers_[0][1]['DE0'].selected_feats)
    feats_in_split.append(selected_feats_i)
    hyperparams['n_feats'].append(np.sum(selected_feats_i))
    hyperparams['n_components'].append(MyMultiTrainTester.TrainerList[i].model.best_params_['FullData__MatrixPipeLine__pca__n_components'])
hyperparams_df = pd.DataFrame(hyperparams)
hyperparams_df
| C | n_feats | n_components | |
|---|---|---|---|
| 0 | 0.367879 | 34 | 3 |
| 1 | 22026.465795 | 59 | 3 |
| 2 | 22026.465795 | 33 | 3 |
| 3 | 22026.465795 | 24 | 3 |
| 4 | 22026.465795 | 41 | 3 |
| 5 | 22026.465795 | 41 | 3 |
| 6 | 0.367879 | 59 | 3 |
| 7 | 22026.465795 | 32 | 3 |
| 8 | 0.135335 | 38 | 3 |
| 9 | 22026.465795 | 52 | 3 |
# Full panel of scoring metrics for every split; persisted for reporting
scoring_metrics = MyMultiTrainTester.getScores()
scoring_metrics_df = pd.DataFrame(scoring_metrics)
scoring_metrics_df.to_csv(os.path.join(output_dir, 'scoring_metrics.csv'))
sns.set(rc={"figure.figsize":(12, 8)})
sns.boxplot(data = scoring_metrics_df, x = 'score_type', y = 'value')
<AxesSubplot:xlabel='score_type', ylabel='value'>
# Summarize each scoring metric across the n_splits test folds
score_names = np.unique(scoring_metrics['score_type'])
score_stats = {'score': [], 'mean': [], 'median': [], 'std_dev': []}
for score in score_names:
    score_stats['score'].append(score)
    score_vect = scoring_metrics_df['value'].to_numpy()[scoring_metrics_df['score_type'] == score]
    score_stats['mean'].append(np.mean(score_vect))
    score_stats['median'].append(np.median(score_vect))
    # np.std is the population std (ddof=0), unlike pandas' default (ddof=1)
    score_stats['std_dev'].append(np.std(score_vect))
score_stats_df = pd.DataFrame(score_stats)
score_stats_df
| score | mean | median | std_dev | |
|---|---|---|---|---|
| 0 | AUPRC_NEG | 0.748137 | 0.757587 | 0.054356 |
| 1 | AUPRC_POS | 0.855668 | 0.867828 | 0.049933 |
| 2 | AUROC_NEG | 0.817979 | 0.822345 | 0.025676 |
| 3 | AUROC_POS | 0.817979 | 0.822345 | 0.025676 |
| 4 | f1_score | 0.780801 | 0.786517 | 0.032973 |
| 5 | npv_score | 0.702303 | 0.686607 | 0.052909 |
| 6 | ppv_score | 0.808149 | 0.812361 | 0.036192 |
| 7 | sensitivity | 0.757376 | 0.744681 | 0.051628 |
| 8 | specificity | 0.759180 | 0.766407 | 0.045476 |
# Write the summary table next to this run's other outputs; the original bare
# relative path landed in the CWD and was overwritten by every later run.
score_stats_df.to_csv(os.path.join(output_dir, 'score_stats.csv'))
# Diagnostic plots across splits: confusion matrices (normalized and raw
# counts), predicted class frequencies, and precision-recall curves.
MyMultiTrainTester.plot_confusion(normalize=True, figsize=(15,25))
MyMultiTrainTester.plot_confusion(normalize=False, figsize=(15,25))
MyMultiTrainTester.plot_class_freq(normalize=True, figsize=(15,35))
MyMultiTrainTester.plot_precrecall(figsize=(15,35))
This notebook is intended as a generic notebook to be used with the papermill python library to allow automated generation of analyses and reports for classifiers on microbiome data generated by the kraken2 pipeline
cd /project/src
[Errno 2] No such file or directory: '/project/src' /project/6011811/data/microbiome_OJS/workflow
from sklearn import model_selection
from sklearn import metrics
import os
import re
import copy
import numpy as np
import pandas as pd
import sys
sys.path.insert(0, '/project/6011811/data/microbiome_OJS/workflow/src/')
from MicroBiome import MicroBiomeDataSet, Trainer, TrainTester, MultiTrainTester, list_transformer, DiffExpTransform
from ScoreFunctions import *
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn import linear_model as LM
import seaborn as sns
import pickle as pk
from matplotlib import pyplot as plt
# Ignore warning messages (sklearn/pandas emit many benign warnings here).
# The original wrapped these two lines in a no-op `if True:` block; removed.
import warnings
warnings.filterwarnings('ignore')
# Default parameters; papermill overrides these via the injected
# "Parameters" cell that follows.
input_dir = '/project/data/preprocessed/PE_50K_sex_complete'
output_dir = '/project/results/LR_Classifier_DE_w_clinical_generic'
retrain = True  # False -> load previously pickled results instead of refitting
fdr_DE = 0.05   # FDR cutoff for the differential-expression feature filter
# Parameters
input_dir = "results/kraken2_PE_50K/notebooks/PE_50K_Sex/prepped_data"
output_dir = "results/kraken2_PE_50K/notebooks/PE_50K_Sex/LR_DE_clinical"
retrain = True
fdr_DE = 0.05
# Sanity check: list the prepped input files
os.listdir(input_dir)
['meta_data_mat.pk', 'metadata_samples_keep.csv', 'y.pk', 'feat_meta.csv', 'X.pk']
# Load prepped inputs; with-blocks guarantee the handles are closed even if
# unpickling fails (the original open/close pairs leaked on error).
# NOTE: pickle.load can execute arbitrary code — these files come from our
# own prep pipeline, so that is acceptable here.
with open(os.path.join(input_dir, 'X.pk'), 'rb') as infile_X:
    X = pk.load(infile_X)  # feature matrix (samples x features)
with open(os.path.join(input_dir, 'y.pk'), 'rb') as infile_y:
    y = pk.load(infile_y)  # target labels
with open(os.path.join(input_dir, 'meta_data_mat.pk'), 'rb') as infile_meta_data_mat:
    meta_data_mat = pk.load(infile_meta_data_mat)  # clinical covariate matrix
# model input: microbiome features followed by clinical metadata columns
X_inp = np.concatenate([X, meta_data_mat], axis=1)
n_splits = 10
out_path = os.path.join(output_dir, 'MyMultiTrainTester.pk')
if retrain:
    # clear previous results, if any. shutil.rmtree replaces the original
    # `os.system('rm -rf ' + output_dir)`, which is shell-injection-prone and
    # breaks on paths with spaces/metacharacters; os.makedirs (vs os.mkdir)
    # also creates missing parent directories.
    import shutil
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)
    # we run PCA on microbiome data and leave clinical data intact
    MyLogistic = LM.LogisticRegression(random_state=42, class_weight='balanced',
                                       penalty='elasticnet', solver='saga', l1_ratio=0.5)
    # microbiome branch: DE feature selection -> standardize -> PCA
    MatrixPipeLine = Pipeline([('DE0', DiffExpTransform(fdr=fdr_DE)), ('scaler0', StandardScaler()), ('pca', PCA())])
    # NOTE(review): only column index X.shape[1] (the first metadata column) is
    # passed through; fine while meta_data_mat carries a single informative
    # column (e.g. binary sex) — confirm if more clinical covariates are added.
    MyPipeLine = ColumnTransformer([('MatrixPipeLine', MatrixPipeLine, np.arange(0, X.shape[1])),
                                    ('MetaData', 'passthrough', [X.shape[1]])])
    clf = Pipeline([('FullData', MyPipeLine), ('LR', MyLogistic)])
    # grid: C spans e^10 down to e^-9; PCA fixed at 3 components
    param_grid = dict(LR__C=np.exp(-np.arange(-10, 10)),
                      FullData__MatrixPipeLine__pca__n_components=np.arange(3, 4))
    model = model_selection.GridSearchCV(clf, param_grid, scoring=metrics.make_scorer(metrics.f1_score), cv=5)
    # Trainer
    MyTrainer = Trainer(model=model)
    # random seed used in class definition is not used in final output models
    MyTrainTester = TrainTester(MyTrainer, metrics.f1_score)
    # note that random seed here affects sequence of seeds passed to making new TrainTester objects
    # using LRTrainTester as template. Thus, you have all settings but seed affecting sample split
    # across all data splits
    MyMultiTrainTester = MultiTrainTester(MyTrainTester, numpy_rand_seed=42, n_splits=n_splits)
    MyMultiTrainTester.train(X_inp, y)
    # save results (with-block guarantees the handle is closed on error)
    with open(out_path, 'wb') as outfile:
        pk.dump(MyMultiTrainTester, outfile)
else:
    # load previous results
    with open(out_path, 'rb') as infile:
        MyMultiTrainTester = pk.load(infile)
Running for split 1 of 10 Using predict_proba getting predictions from probs Running for split 2 of 10 Using predict_proba getting predictions from probs Running for split 3 of 10 Using predict_proba getting predictions from probs Running for split 4 of 10 Using predict_proba getting predictions from probs Running for split 5 of 10 Using predict_proba getting predictions from probs Running for split 6 of 10 Using predict_proba getting predictions from probs Running for split 7 of 10 Using predict_proba getting predictions from probs Running for split 8 of 10 Using predict_proba getting predictions from probs Running for split 9 of 10 Using predict_proba getting predictions from probs Running for split 10 of 10 Using predict_proba getting predictions from probs
# Per-split train/test F1 in one tidy frame.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4
# and removed in pandas 2.0.
scores_df = pd.concat([
    pd.DataFrame({'score': MyMultiTrainTester.train_scores, 'stage': np.repeat('train', n_splits)}),
    pd.DataFrame({'score': MyMultiTrainTester.test_scores, 'stage': np.repeat('test', n_splits)}),
])
scores_df
| score | stage | |
|---|---|---|
| 0 | 0.761006 | train |
| 1 | 0.783951 | train |
| 2 | 0.795181 | train |
| 3 | 0.765432 | train |
| 4 | 0.775641 | train |
| 5 | 0.775641 | train |
| 6 | 0.783699 | train |
| 7 | 0.753247 | train |
| 8 | 0.767802 | train |
| 9 | 0.753247 | train |
| 0 | 0.795181 | test |
| 1 | 0.771084 | test |
| 2 | 0.724638 | test |
| 3 | 0.831461 | test |
| 4 | 0.777778 | test |
| 5 | 0.777778 | test |
| 6 | 0.729730 | test |
| 7 | 0.853933 | test |
| 8 | 0.771084 | test |
| 9 | 0.800000 | test |
# Compare train vs held-out test F1 across the splits
sns.boxplot(data = scores_df, x = 'stage', y = 'score')
<AxesSubplot:xlabel='stage', ylabel='score'>
# hyperparams = {'l1_ratio': [], 'C': []}
# For each of the n_splits fitted models, record the hyperparameters chosen
# by GridSearchCV and how many taxa survived the DE feature-selection step.
feats_in_split = []
hyperparams = {'C': [], 'n_feats': [], 'n_components': []}
for i in range(n_splits):
    hyperparams['C'].append(MyMultiTrainTester.TrainerList[i].model.best_params_['LR__C'])
    # selected_feats: per-feature selection flags from the DE0 step of the
    # winning pipeline (deep-copied so we hold no references into the model);
    # presumably a boolean mask — n_feats below is its sum. TODO confirm.
    selected_feats_i = copy.deepcopy(MyMultiTrainTester.TrainerList[i].model.best_estimator_['FullData'].transformers_[0][1]['DE0'].selected_feats)
    feats_in_split.append(selected_feats_i)
    hyperparams['n_feats'].append(np.sum(selected_feats_i))
    hyperparams['n_components'].append(MyMultiTrainTester.TrainerList[i].model.best_params_['FullData__MatrixPipeLine__pca__n_components'])
hyperparams_df = pd.DataFrame(hyperparams)
hyperparams_df
| C | n_feats | n_components | |
|---|---|---|---|
| 0 | 1.000000 | 71 | 3 |
| 1 | 20.085537 | 87 | 3 |
| 2 | 22026.465795 | 67 | 3 |
| 3 | 2.718282 | 54 | 3 |
| 4 | 0.367879 | 79 | 3 |
| 5 | 0.367879 | 79 | 3 |
| 6 | 1.000000 | 80 | 3 |
| 7 | 0.367879 | 62 | 3 |
| 8 | 22026.465795 | 66 | 3 |
| 9 | 2.718282 | 90 | 3 |
# Full panel of scoring metrics for every split; persisted for reporting
scoring_metrics = MyMultiTrainTester.getScores()
scoring_metrics_df = pd.DataFrame(scoring_metrics)
scoring_metrics_df.to_csv(os.path.join(output_dir, 'scoring_metrics.csv'))
sns.set(rc={"figure.figsize":(12, 8)})
sns.boxplot(data = scoring_metrics_df, x = 'score_type', y = 'value')
<AxesSubplot:xlabel='score_type', ylabel='value'>
# Summarize each scoring metric across the n_splits test folds
score_names = np.unique(scoring_metrics['score_type'])
score_stats = {'score': [], 'mean': [], 'median': [], 'std_dev': []}
for score in score_names:
    score_stats['score'].append(score)
    score_vect = scoring_metrics_df['value'].to_numpy()[scoring_metrics_df['score_type'] == score]
    score_stats['mean'].append(np.mean(score_vect))
    score_stats['median'].append(np.median(score_vect))
    # np.std is the population std (ddof=0), unlike pandas' default (ddof=1)
    score_stats['std_dev'].append(np.std(score_vect))
score_stats_df = pd.DataFrame(score_stats)
score_stats_df
| score | mean | median | std_dev | |
|---|---|---|---|---|
| 0 | AUPRC_NEG | 0.750434 | 0.757013 | 0.052412 |
| 1 | AUPRC_POS | 0.865108 | 0.875572 | 0.048758 |
| 2 | AUROC_NEG | 0.826179 | 0.825229 | 0.028736 |
| 3 | AUROC_POS | 0.826179 | 0.825229 | 0.028736 |
| 4 | f1_score | 0.783267 | 0.777778 | 0.037946 |
| 5 | npv_score | 0.702349 | 0.710801 | 0.043974 |
| 6 | ppv_score | 0.818429 | 0.813953 | 0.043794 |
| 7 | sensitivity | 0.752272 | 0.744681 | 0.045315 |
| 8 | specificity | 0.778407 | 0.778989 | 0.046414 |
# Write the summary table next to this run's other outputs; the original bare
# relative path landed in the CWD and was overwritten by every later run.
score_stats_df.to_csv(os.path.join(output_dir, 'score_stats.csv'))
# Diagnostic plots across splits: confusion matrices (normalized and raw
# counts), predicted class frequencies, and precision-recall curves.
MyMultiTrainTester.plot_confusion(normalize=True, figsize=(15,25))
MyMultiTrainTester.plot_confusion(normalize=False, figsize=(15,25))
MyMultiTrainTester.plot_class_freq(normalize=True, figsize=(15,35))
MyMultiTrainTester.plot_precrecall(figsize=(15,35))
This notebook is intended as a generic notebook to be used with the papermill python library to allow automated generation of analyses and reports for classifiers on microbiome data generated by the kraken2 pipeline
cd /project/src
[Errno 2] No such file or directory: '/project/src' /project/6011811/data/microbiome_OJS/workflow
from sklearn import model_selection
from sklearn import metrics
import os
import re
import copy
import numpy as np
import pandas as pd
import sys
sys.path.insert(0, '/project/6011811/data/microbiome_OJS/workflow/src/')
from MicroBiome import MicroBiomeDataSet, Trainer, TrainTester, MultiTrainTester, list_transformer, DiffExpTransform
from ScoreFunctions import *
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn import linear_model as LM
import seaborn as sns
import pickle as pk
from matplotlib import pyplot as plt
# Ignore warning messages (sklearn/pandas emit many benign warnings here).
# The original wrapped these two lines in a no-op `if True:` block; removed.
import warnings
warnings.filterwarnings('ignore')
# Default parameters; papermill overrides these via the injected
# "Parameters" cell that follows.
input_dir = '/project/data/preprocessed/PE_50K_sex_complete'
output_dir = '/project/results/LR_Classifier_DE_w_clinical_generic'
retrain = True  # False -> load previously pickled results instead of refitting
fdr_DE = 0.05   # FDR cutoff for the differential-expression feature filter
# Parameters
input_dir = "results/kraken2_PE_100K/notebooks/PE_100K_Sex/prepped_data"
output_dir = "results/kraken2_PE_100K/notebooks/PE_100K_Sex/LR_DE_clinical"
retrain = True
fdr_DE = 0.05
# Sanity check: list the prepped input files
os.listdir(input_dir)
['meta_data_mat.pk', 'metadata_samples_keep.csv', 'y.pk', 'feat_meta.csv', 'X.pk']
# Load prepped inputs; with-blocks guarantee the handles are closed even if
# unpickling fails (the original open/close pairs leaked on error).
# NOTE: pickle.load can execute arbitrary code — these files come from our
# own prep pipeline, so that is acceptable here.
with open(os.path.join(input_dir, 'X.pk'), 'rb') as infile_X:
    X = pk.load(infile_X)  # feature matrix (samples x features)
with open(os.path.join(input_dir, 'y.pk'), 'rb') as infile_y:
    y = pk.load(infile_y)  # target labels
with open(os.path.join(input_dir, 'meta_data_mat.pk'), 'rb') as infile_meta_data_mat:
    meta_data_mat = pk.load(infile_meta_data_mat)  # clinical covariate matrix
# model input: microbiome features followed by clinical metadata columns
X_inp = np.concatenate([X, meta_data_mat], axis=1)
n_splits = 10
out_path = os.path.join(output_dir, 'MyMultiTrainTester.pk')
if retrain:
    # clear previous results, if any. shutil.rmtree replaces the original
    # `os.system('rm -rf ' + output_dir)`, which is shell-injection-prone and
    # breaks on paths with spaces/metacharacters; os.makedirs (vs os.mkdir)
    # also creates missing parent directories.
    import shutil
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)
    # we run PCA on microbiome data and leave clinical data intact
    MyLogistic = LM.LogisticRegression(random_state=42, class_weight='balanced',
                                       penalty='elasticnet', solver='saga', l1_ratio=0.5)
    # microbiome branch: DE feature selection -> standardize -> PCA
    MatrixPipeLine = Pipeline([('DE0', DiffExpTransform(fdr=fdr_DE)), ('scaler0', StandardScaler()), ('pca', PCA())])
    # NOTE(review): only column index X.shape[1] (the first metadata column) is
    # passed through; fine while meta_data_mat carries a single informative
    # column (e.g. binary sex) — confirm if more clinical covariates are added.
    MyPipeLine = ColumnTransformer([('MatrixPipeLine', MatrixPipeLine, np.arange(0, X.shape[1])),
                                    ('MetaData', 'passthrough', [X.shape[1]])])
    clf = Pipeline([('FullData', MyPipeLine), ('LR', MyLogistic)])
    # grid: C spans e^10 down to e^-9; PCA fixed at 3 components
    param_grid = dict(LR__C=np.exp(-np.arange(-10, 10)),
                      FullData__MatrixPipeLine__pca__n_components=np.arange(3, 4))
    model = model_selection.GridSearchCV(clf, param_grid, scoring=metrics.make_scorer(metrics.f1_score), cv=5)
    # Trainer
    MyTrainer = Trainer(model=model)
    # random seed used in class definition is not used in final output models
    MyTrainTester = TrainTester(MyTrainer, metrics.f1_score)
    # note that random seed here affects sequence of seeds passed to making new TrainTester objects
    # using LRTrainTester as template. Thus, you have all settings but seed affecting sample split
    # across all data splits
    MyMultiTrainTester = MultiTrainTester(MyTrainTester, numpy_rand_seed=42, n_splits=n_splits)
    MyMultiTrainTester.train(X_inp, y)
    # save results (with-block guarantees the handle is closed on error)
    with open(out_path, 'wb') as outfile:
        pk.dump(MyMultiTrainTester, outfile)
else:
    # load previous results
    with open(out_path, 'rb') as infile:
        MyMultiTrainTester = pk.load(infile)
Running for split 1 of 10 Using predict_proba getting predictions from probs Running for split 2 of 10 Using predict_proba getting predictions from probs Running for split 3 of 10 Using predict_proba getting predictions from probs Running for split 4 of 10 Using predict_proba getting predictions from probs Running for split 5 of 10 Using predict_proba getting predictions from probs Running for split 6 of 10 Using predict_proba getting predictions from probs Running for split 7 of 10 Using predict_proba getting predictions from probs Running for split 8 of 10 Using predict_proba getting predictions from probs Running for split 9 of 10 Using predict_proba getting predictions from probs Running for split 10 of 10 Using predict_proba getting predictions from probs
# Per-split train/test F1 in one tidy frame.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4
# and removed in pandas 2.0.
scores_df = pd.concat([
    pd.DataFrame({'score': MyMultiTrainTester.train_scores, 'stage': np.repeat('train', n_splits)}),
    pd.DataFrame({'score': MyMultiTrainTester.test_scores, 'stage': np.repeat('test', n_splits)}),
])
scores_df
| score | stage | |
|---|---|---|
| 0 | 0.757282 | train |
| 1 | 0.794953 | train |
| 2 | 0.773006 | train |
| 3 | 0.768293 | train |
| 4 | 0.760383 | train |
| 5 | 0.760383 | train |
| 6 | 0.774603 | train |
| 7 | 0.759076 | train |
| 8 | 0.764890 | train |
| 9 | 0.753247 | train |
| 0 | 0.785714 | test |
| 1 | 0.756098 | test |
| 2 | 0.695652 | test |
| 3 | 0.818182 | test |
| 4 | 0.777778 | test |
| 5 | 0.777778 | test |
| 6 | 0.729730 | test |
| 7 | 0.860215 | test |
| 8 | 0.800000 | test |
| 9 | 0.818182 | test |
# Compare train vs held-out test F1 across the splits
sns.boxplot(data = scores_df, x = 'stage', y = 'score')
<AxesSubplot:xlabel='stage', ylabel='score'>
# hyperparams = {'l1_ratio': [], 'C': []}
# For each of the n_splits fitted models, record the hyperparameters chosen
# by GridSearchCV and how many taxa survived the DE feature-selection step.
feats_in_split = []
hyperparams = {'C': [], 'n_feats': [], 'n_components': []}
for i in range(n_splits):
    hyperparams['C'].append(MyMultiTrainTester.TrainerList[i].model.best_params_['LR__C'])
    # selected_feats: per-feature selection flags from the DE0 step of the
    # winning pipeline (deep-copied so we hold no references into the model);
    # presumably a boolean mask — n_feats below is its sum. TODO confirm.
    selected_feats_i = copy.deepcopy(MyMultiTrainTester.TrainerList[i].model.best_estimator_['FullData'].transformers_[0][1]['DE0'].selected_feats)
    feats_in_split.append(selected_feats_i)
    hyperparams['n_feats'].append(np.sum(selected_feats_i))
    hyperparams['n_components'].append(MyMultiTrainTester.TrainerList[i].model.best_params_['FullData__MatrixPipeLine__pca__n_components'])
hyperparams_df = pd.DataFrame(hyperparams)
hyperparams_df
| C | n_feats | n_components | |
|---|---|---|---|
| 0 | 1.000000 | 80 | 3 |
| 1 | 2.718282 | 94 | 3 |
| 2 | 0.367879 | 78 | 3 |
| 3 | 22026.465795 | 60 | 3 |
| 4 | 1.000000 | 94 | 3 |
| 5 | 1.000000 | 94 | 3 |
| 6 | 22026.465795 | 107 | 3 |
| 7 | 1.000000 | 85 | 3 |
| 8 | 22026.465795 | 84 | 3 |
| 9 | 22026.465795 | 106 | 3 |
# Full panel of scoring metrics for every split; persisted for reporting
scoring_metrics = MyMultiTrainTester.getScores()
scoring_metrics_df = pd.DataFrame(scoring_metrics)
scoring_metrics_df.to_csv(os.path.join(output_dir, 'scoring_metrics.csv'))
sns.set(rc={"figure.figsize":(12, 8)})
sns.boxplot(data = scoring_metrics_df, x = 'score_type', y = 'value')
<AxesSubplot:xlabel='score_type', ylabel='value'>
# Summarize each scoring metric across the n_splits test folds
score_names = np.unique(scoring_metrics['score_type'])
score_stats = {'score': [], 'mean': [], 'median': [], 'std_dev': []}
for score in score_names:
    score_stats['score'].append(score)
    score_vect = scoring_metrics_df['value'].to_numpy()[scoring_metrics_df['score_type'] == score]
    score_stats['mean'].append(np.mean(score_vect))
    score_stats['median'].append(np.median(score_vect))
    # np.std is the population std (ddof=0), unlike pandas' default (ddof=1)
    score_stats['std_dev'].append(np.std(score_vect))
score_stats_df = pd.DataFrame(score_stats)
score_stats_df
| score | mean | median | std_dev | |
|---|---|---|---|---|
| 0 | AUPRC_NEG | 0.747095 | 0.750059 | 0.058689 |
| 1 | AUPRC_POS | 0.862509 | 0.876568 | 0.048247 |
| 2 | AUROC_NEG | 0.821452 | 0.814507 | 0.030427 |
| 3 | AUROC_POS | 0.821452 | 0.814507 | 0.030427 |
| 4 | f1_score | 0.781933 | 0.781746 | 0.044686 |
| 5 | npv_score | 0.705948 | 0.700881 | 0.049070 |
| 6 | ppv_score | 0.808715 | 0.813953 | 0.041853 |
| 7 | sensitivity | 0.758353 | 0.744681 | 0.058545 |
| 8 | specificity | 0.762844 | 0.774597 | 0.035262 |
# Write the summary table next to this run's other outputs; the original bare
# relative path landed in the CWD and was overwritten by every later run.
score_stats_df.to_csv(os.path.join(output_dir, 'score_stats.csv'))
# Diagnostic plots across splits: confusion matrices (normalized and raw
# counts), predicted class frequencies, and precision-recall curves.
MyMultiTrainTester.plot_confusion(normalize=True, figsize=(15,25))
MyMultiTrainTester.plot_confusion(normalize=False, figsize=(15,25))
MyMultiTrainTester.plot_class_freq(normalize=True, figsize=(15,35))
MyMultiTrainTester.plot_precrecall(figsize=(15,35))
This notebook is intended as a generic notebook to be used with the papermill python library to allow automated generation of analyses and reports for classifiers on microbiome data generated by the kraken2 pipeline
cd /project/src
[Errno 2] No such file or directory: '/project/src' /project/6011811/data/microbiome_OJS/workflow
from sklearn import model_selection
from sklearn import metrics
import os
import re
import copy
import numpy as np
import pandas as pd
import sys
sys.path.insert(0, '/project/6011811/data/microbiome_OJS/workflow/src/')
from MicroBiome import MicroBiomeDataSet, Trainer, TrainTester, MultiTrainTester, list_transformer, DiffExpTransform
from ScoreFunctions import *
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn import linear_model as LM
import seaborn as sns
import pickle as pk
from matplotlib import pyplot as plt
# Ignore warning messages (sklearn/pandas emit many benign warnings here).
# The original wrapped these two lines in a no-op `if True:` block; removed.
import warnings
warnings.filterwarnings('ignore')
input_dir = '/project/data/preprocessed/PE_50K_sex_complete'
output_dir = '/project/results/LR_Classifier_DE_w_clinical_generic'
retrain = True
fdr_DE = 0.05
# Parameters
input_dir = "results/kraken2_PE_500K/notebooks/PE_500K_Sex/prepped_data"
output_dir = "results/kraken2_PE_500K/notebooks/PE_500K_Sex/LR_DE_clinical"
retrain = True
fdr_DE = 0.05
os.listdir(input_dir)
['meta_data_mat.pk', 'metadata_samples_keep.csv', 'y.pk', 'feat_meta.csv', 'X.pk']
# Load preprocessed inputs produced by the data-prep notebook.
# Context managers guarantee the handles close even if unpickling raises.
with open(os.path.join(input_dir, 'X.pk'), 'rb') as infile_X:
    X = pk.load(infile_X)  # microbiome feature matrix (samples x taxa)
with open(os.path.join(input_dir, 'y.pk'), 'rb') as infile_y:
    y = pk.load(infile_y)  # class labels
with open(os.path.join(input_dir, 'meta_data_mat.pk'), 'rb') as infile_meta_data_mat:
    meta_data_mat = pk.load(infile_meta_data_mat)  # clinical covariate matrix
# model input: microbiome features concatenated with clinical metadata columns
X_inp = np.concatenate([X, meta_data_mat], axis=1)
n_splits = 10
out_path = os.path.join(output_dir, 'MyMultiTrainTester.pk')
if retrain:
    # Clear previous results, if any. shutil.rmtree replaces the previous
    # os.system('rm -rf ' + output_dir): no shell involved (no injection via
    # the papermill-supplied path) and portable across platforms.
    import shutil
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)  # also creates missing parent directories
    # We run PCA on microbiome data and leave clinical data intact.
    MyLogistic = LM.LogisticRegression(random_state=42, class_weight='balanced',
                                       penalty='elasticnet', solver='saga', l1_ratio=0.5)
    MatrixPipeLine = Pipeline([('DE0', DiffExpTransform(fdr=fdr_DE)), ('scaler0', StandardScaler()), ('pca', PCA())])
    # NOTE(review): the passthrough selects only column X.shape[1] — a single
    # metadata column. Fine if meta_data_mat has exactly one column (sex);
    # confirm, otherwise use np.arange(X.shape[1], X_inp.shape[1]).
    MyPipeLine = ColumnTransformer([('MatrixPipeLine', MatrixPipeLine, np.arange(0, X.shape[1])),
                                    ('MetaData', 'passthrough', [X.shape[1]])])
    clf = Pipeline([('FullData', MyPipeLine), ('LR', MyLogistic)])
    param_grid = dict(LR__C=np.exp(-np.arange(-10, 10)),
                      FullData__MatrixPipeLine__pca__n_components=np.arange(3, 4))
    model = model_selection.GridSearchCV(clf, param_grid, scoring=metrics.make_scorer(metrics.f1_score), cv=5)
    # Trainer wraps the fitted grid search for the TrainTester machinery.
    MyTrainer = Trainer(model=model)
    # random seed used in class definition is not used in final output models
    MyTrainTester = TrainTester(MyTrainer, metrics.f1_score)
    # note that random seed here affects sequence of seeds passed to making new TrainTester objects
    # using LRTrainTester as template. Thus, you have all settings but seed affecting sample split
    # across all data splits
    MyMultiTrainTester = MultiTrainTester(MyTrainTester, numpy_rand_seed=42, n_splits=n_splits)
    MyMultiTrainTester.train(X_inp, y)
    # Save results; context manager guarantees the handle is closed.
    with open(out_path, 'wb') as outfile:
        pk.dump(MyMultiTrainTester, outfile)
else:
    # Load previous results.
    with open(out_path, 'rb') as infile:
        MyMultiTrainTester = pk.load(infile)
Running for split 1 of 10 Using predict_proba getting predictions from probs Running for split 2 of 10 Using predict_proba getting predictions from probs Running for split 3 of 10 Using predict_proba getting predictions from probs Running for split 4 of 10 Using predict_proba getting predictions from probs Running for split 5 of 10 Using predict_proba getting predictions from probs Running for split 6 of 10 Using predict_proba getting predictions from probs Running for split 7 of 10 Using predict_proba getting predictions from probs Running for split 8 of 10 Using predict_proba getting predictions from probs Running for split 9 of 10 Using predict_proba getting predictions from probs Running for split 10 of 10 Using predict_proba getting predictions from probs
# DataFrame.append is deprecated and was removed in pandas 2.0; build both
# halves and concatenate once. Index 0..n_splits-1 repeats, as before.
scores_df = pd.concat([
    pd.DataFrame({'score': MyMultiTrainTester.train_scores, 'stage': np.repeat('train', n_splits)}),
    pd.DataFrame({'score': MyMultiTrainTester.test_scores, 'stage': np.repeat('test', n_splits)}),
])
scores_df
| score | stage | |
|---|---|---|
| 0 | 0.766355 | train |
| 1 | 0.785047 | train |
| 2 | 0.788060 | train |
| 3 | 0.770642 | train |
| 4 | 0.767296 | train |
| 5 | 0.767296 | train |
| 6 | 0.780186 | train |
| 7 | 0.738562 | train |
| 8 | 0.770186 | train |
| 9 | 0.751592 | train |
| 0 | 0.785714 | test |
| 1 | 0.756098 | test |
| 2 | 0.724638 | test |
| 3 | 0.808989 | test |
| 4 | 0.786517 | test |
| 5 | 0.786517 | test |
| 6 | 0.720000 | test |
| 7 | 0.838710 | test |
| 8 | 0.776471 | test |
| 9 | 0.790698 | test |
sns.boxplot(data = scores_df, x = 'stage', y = 'score')
<AxesSubplot:xlabel='stage', ylabel='score'>
# Collect, for each CV split, the winning hyper-parameters and the boolean
# mask of features retained by the differential-expression step.
feats_in_split = []
hyperparams = {'C': [], 'n_feats': [], 'n_components': []}
for split_idx in range(n_splits):
    search = MyMultiTrainTester.TrainerList[split_idx].model
    best = search.best_params_
    de_step = search.best_estimator_['FullData'].transformers_[0][1]['DE0']
    mask = copy.deepcopy(de_step.selected_feats)
    feats_in_split.append(mask)
    hyperparams['C'].append(best['LR__C'])
    hyperparams['n_feats'].append(np.sum(mask))
    hyperparams['n_components'].append(best['FullData__MatrixPipeLine__pca__n_components'])
hyperparams_df = pd.DataFrame(hyperparams)
hyperparams_df
| C | n_feats | n_components | |
|---|---|---|---|
| 0 | 22026.465795 | 105 | 3 |
| 1 | 22026.465795 | 128 | 3 |
| 2 | 7.389056 | 103 | 3 |
| 3 | 22026.465795 | 86 | 3 |
| 4 | 1.000000 | 110 | 3 |
| 5 | 1.000000 | 110 | 3 |
| 6 | 2.718282 | 126 | 3 |
| 7 | 22026.465795 | 100 | 3 |
| 8 | 0.367879 | 104 | 3 |
| 9 | 22026.465795 | 132 | 3 |
# Full per-split scoring metrics in long format (score_type / value).
scoring_metrics = MyMultiTrainTester.getScores()
scoring_metrics_df = pd.DataFrame(scoring_metrics)
scoring_metrics_df.to_csv(os.path.join(output_dir, 'scoring_metrics.csv'))
sns.set(rc={"figure.figsize":(12, 8)})
sns.boxplot(data = scoring_metrics_df, x = 'score_type', y = 'value')
<AxesSubplot:xlabel='score_type', ylabel='value'>
# Summarise each score type across splits: mean, median, population std
# (np.std, ddof=0 — matches the original computation).
score_names = np.unique(scoring_metrics['score_type'])
score_stats = {'score': [], 'mean': [], 'median': [], 'std_dev': []}
all_values = scoring_metrics_df['value'].to_numpy()
for score in score_names:
    mask = (scoring_metrics_df['score_type'] == score).to_numpy()
    score_vect = all_values[mask]
    score_stats['score'].append(score)
    score_stats['mean'].append(np.mean(score_vect))
    score_stats['median'].append(np.median(score_vect))
    score_stats['std_dev'].append(np.std(score_vect))
score_stats_df = pd.DataFrame(score_stats)
score_stats_df
| score | mean | median | std_dev | |
|---|---|---|---|---|
| 0 | AUPRC_NEG | 0.745564 | 0.753783 | 0.054040 |
| 1 | AUPRC_POS | 0.861446 | 0.877002 | 0.046113 |
| 2 | AUROC_NEG | 0.821141 | 0.813709 | 0.027055 |
| 3 | AUROC_POS | 0.821141 | 0.813709 | 0.027055 |
| 4 | f1_score | 0.777435 | 0.786116 | 0.034208 |
| 5 | npv_score | 0.697717 | 0.697222 | 0.036335 |
| 6 | ppv_score | 0.805913 | 0.821591 | 0.035936 |
| 7 | sensitivity | 0.752019 | 0.741388 | 0.044201 |
| 8 | specificity | 0.759638 | 0.758621 | 0.027999 |
# Write summary stats next to the other artifacts in output_dir instead of
# the current working directory (consistent with scoring_metrics.csv above).
score_stats_df.to_csv(os.path.join(output_dir, 'score_stats.csv'))
# Diagnostic plots aggregated across all CV splits.
MyMultiTrainTester.plot_confusion(normalize=True, figsize=(15,25))
MyMultiTrainTester.plot_confusion(normalize=False, figsize=(15,25))
MyMultiTrainTester.plot_class_freq(normalize=True, figsize=(15,35))
MyMultiTrainTester.plot_precrecall(figsize=(15,35))
This notebook is intended as a generic notebook to be used with the papermill python library to allow automated generation of analyses and reports for classifiers on microbiome data generated by the kraken2 pipeline
cd /project/src
[Errno 2] No such file or directory: '/project/src' /project/6011811/data/microbiome_OJS/workflow
from sklearn import model_selection
from sklearn import metrics
import os
import re
import copy
import numpy as np
import pandas as pd
import sys
sys.path.insert(0, '/project/6011811/data/microbiome_OJS/workflow/src/')
from MicroBiome import MicroBiomeDataSet, Trainer, TrainTester, MultiTrainTester, list_transformer, DiffExpTransform
from ScoreFunctions import *
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn import linear_model as LM
import seaborn as sns
import pickle as pk
from matplotlib import pyplot as plt
# Ignore warning messages
if True:
    import warnings
    warnings.filterwarnings('ignore')
# Default parameters (papermill "parameters" cell); the injected cell below
# overrides these when the notebook is executed via papermill.
input_dir = '/project/data/preprocessed/PE_50K_sex_complete'
output_dir = '/project/results/LR_Classifier_DE_w_clinical_generic'
retrain = True  # retrain from scratch instead of loading cached results
fdr_DE = 0.05   # FDR threshold for differential-expression feature selection
# Parameters
# (cell injected by papermill at execution time)
input_dir = "results/kraken2_PE_1M/notebooks/PE_1M_Sex/prepped_data"
output_dir = "results/kraken2_PE_1M/notebooks/PE_1M_Sex/LR_DE_clinical"
retrain = True
fdr_DE = 0.05
# Quick sanity check of the expected preprocessed input files.
os.listdir(input_dir)
['meta_data_mat.pk', 'metadata_samples_keep.csv', 'y.pk', 'feat_meta.csv', 'X.pk']
# Load preprocessed inputs produced by the data-prep notebook.
# Context managers guarantee the handles close even if unpickling raises.
with open(os.path.join(input_dir, 'X.pk'), 'rb') as infile_X:
    X = pk.load(infile_X)  # microbiome feature matrix (samples x taxa)
with open(os.path.join(input_dir, 'y.pk'), 'rb') as infile_y:
    y = pk.load(infile_y)  # class labels
with open(os.path.join(input_dir, 'meta_data_mat.pk'), 'rb') as infile_meta_data_mat:
    meta_data_mat = pk.load(infile_meta_data_mat)  # clinical covariate matrix
# model input: microbiome features concatenated with clinical metadata columns
X_inp = np.concatenate([X, meta_data_mat], axis=1)
n_splits = 10
out_path = os.path.join(output_dir, 'MyMultiTrainTester.pk')
if retrain:
    # Clear previous results, if any. shutil.rmtree replaces the previous
    # os.system('rm -rf ' + output_dir): no shell involved (no injection via
    # the papermill-supplied path) and portable across platforms.
    import shutil
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)  # also creates missing parent directories
    # We run PCA on microbiome data and leave clinical data intact.
    MyLogistic = LM.LogisticRegression(random_state=42, class_weight='balanced',
                                       penalty='elasticnet', solver='saga', l1_ratio=0.5)
    MatrixPipeLine = Pipeline([('DE0', DiffExpTransform(fdr=fdr_DE)), ('scaler0', StandardScaler()), ('pca', PCA())])
    # NOTE(review): the passthrough selects only column X.shape[1] — a single
    # metadata column. Fine if meta_data_mat has exactly one column (sex);
    # confirm, otherwise use np.arange(X.shape[1], X_inp.shape[1]).
    MyPipeLine = ColumnTransformer([('MatrixPipeLine', MatrixPipeLine, np.arange(0, X.shape[1])),
                                    ('MetaData', 'passthrough', [X.shape[1]])])
    clf = Pipeline([('FullData', MyPipeLine), ('LR', MyLogistic)])
    param_grid = dict(LR__C=np.exp(-np.arange(-10, 10)),
                      FullData__MatrixPipeLine__pca__n_components=np.arange(3, 4))
    model = model_selection.GridSearchCV(clf, param_grid, scoring=metrics.make_scorer(metrics.f1_score), cv=5)
    # Trainer wraps the fitted grid search for the TrainTester machinery.
    MyTrainer = Trainer(model=model)
    # random seed used in class definition is not used in final output models
    MyTrainTester = TrainTester(MyTrainer, metrics.f1_score)
    # note that random seed here affects sequence of seeds passed to making new TrainTester objects
    # using LRTrainTester as template. Thus, you have all settings but seed affecting sample split
    # across all data splits
    MyMultiTrainTester = MultiTrainTester(MyTrainTester, numpy_rand_seed=42, n_splits=n_splits)
    MyMultiTrainTester.train(X_inp, y)
    # Save results; context manager guarantees the handle is closed.
    with open(out_path, 'wb') as outfile:
        pk.dump(MyMultiTrainTester, outfile)
else:
    # Load previous results.
    with open(out_path, 'rb') as infile:
        MyMultiTrainTester = pk.load(infile)
Running for split 1 of 10 Using predict_proba getting predictions from probs Running for split 2 of 10 Using predict_proba getting predictions from probs Running for split 3 of 10 Using predict_proba getting predictions from probs Running for split 4 of 10 Using predict_proba getting predictions from probs Running for split 5 of 10 Using predict_proba getting predictions from probs Running for split 6 of 10 Using predict_proba getting predictions from probs Running for split 7 of 10 Using predict_proba getting predictions from probs Running for split 8 of 10 Using predict_proba getting predictions from probs Running for split 9 of 10 Using predict_proba getting predictions from probs Running for split 10 of 10 Using predict_proba getting predictions from probs
# DataFrame.append is deprecated and was removed in pandas 2.0; build both
# halves and concatenate once. Index 0..n_splits-1 repeats, as before.
scores_df = pd.concat([
    pd.DataFrame({'score': MyMultiTrainTester.train_scores, 'stage': np.repeat('train', n_splits)}),
    pd.DataFrame({'score': MyMultiTrainTester.test_scores, 'stage': np.repeat('test', n_splits)}),
])
scores_df
| score | stage | |
|---|---|---|
| 0 | 0.768750 | train |
| 1 | 0.788644 | train |
| 2 | 0.777108 | train |
| 3 | 0.766355 | train |
| 4 | 0.772152 | train |
| 5 | 0.772152 | train |
| 6 | 0.785047 | train |
| 7 | 0.751634 | train |
| 8 | 0.773994 | train |
| 9 | 0.740260 | train |
| 0 | 0.795181 | test |
| 1 | 0.765432 | test |
| 2 | 0.724638 | test |
| 3 | 0.808989 | test |
| 4 | 0.795699 | test |
| 5 | 0.795699 | test |
| 6 | 0.720000 | test |
| 7 | 0.847826 | test |
| 8 | 0.761905 | test |
| 9 | 0.800000 | test |
sns.boxplot(data = scores_df, x = 'stage', y = 'score')
<AxesSubplot:xlabel='stage', ylabel='score'>
# Collect, for each CV split, the winning hyper-parameters and the boolean
# mask of features retained by the differential-expression step.
feats_in_split = []
hyperparams = {'C': [], 'n_feats': [], 'n_components': []}
for split_idx in range(n_splits):
    search = MyMultiTrainTester.TrainerList[split_idx].model
    best = search.best_params_
    de_step = search.best_estimator_['FullData'].transformers_[0][1]['DE0']
    mask = copy.deepcopy(de_step.selected_feats)
    feats_in_split.append(mask)
    hyperparams['C'].append(best['LR__C'])
    hyperparams['n_feats'].append(np.sum(mask))
    hyperparams['n_components'].append(best['FullData__MatrixPipeLine__pca__n_components'])
hyperparams_df = pd.DataFrame(hyperparams)
hyperparams_df
| C | n_feats | n_components | |
|---|---|---|---|
| 0 | 1.000000 | 105 | 3 |
| 1 | 1.000000 | 127 | 3 |
| 2 | 1.000000 | 102 | 3 |
| 3 | 0.367879 | 91 | 3 |
| 4 | 1.000000 | 107 | 3 |
| 5 | 1.000000 | 107 | 3 |
| 6 | 54.598150 | 122 | 3 |
| 7 | 22026.465795 | 102 | 3 |
| 8 | 22026.465795 | 104 | 3 |
| 9 | 1.000000 | 135 | 3 |
# Full per-split scoring metrics in long format (score_type / value).
scoring_metrics = MyMultiTrainTester.getScores()
scoring_metrics_df = pd.DataFrame(scoring_metrics)
scoring_metrics_df.to_csv(os.path.join(output_dir, 'scoring_metrics.csv'))
sns.set(rc={"figure.figsize":(12, 8)})
sns.boxplot(data = scoring_metrics_df, x = 'score_type', y = 'value')
<AxesSubplot:xlabel='score_type', ylabel='value'>
# Summarise each score type across splits: mean, median, population std
# (np.std, ddof=0 — matches the original computation).
score_names = np.unique(scoring_metrics['score_type'])
score_stats = {'score': [], 'mean': [], 'median': [], 'std_dev': []}
all_values = scoring_metrics_df['value'].to_numpy()
for score in score_names:
    mask = (scoring_metrics_df['score_type'] == score).to_numpy()
    score_vect = all_values[mask]
    score_stats['score'].append(score)
    score_stats['mean'].append(np.mean(score_vect))
    score_stats['median'].append(np.median(score_vect))
    score_stats['std_dev'].append(np.std(score_vect))
score_stats_df = pd.DataFrame(score_stats)
score_stats_df
| score | mean | median | std_dev | |
|---|---|---|---|---|
| 0 | AUPRC_NEG | 0.737818 | 0.754843 | 0.059767 |
| 1 | AUPRC_POS | 0.858921 | 0.874124 | 0.048066 |
| 2 | AUROC_NEG | 0.815883 | 0.804040 | 0.031995 |
| 3 | AUROC_POS | 0.815883 | 0.804040 | 0.031995 |
| 4 | f1_score | 0.781537 | 0.795440 | 0.037074 |
| 5 | npv_score | 0.703134 | 0.701351 | 0.035207 |
| 6 | ppv_score | 0.807562 | 0.804348 | 0.037089 |
| 7 | sensitivity | 0.758307 | 0.762665 | 0.047790 |
| 8 | specificity | 0.758401 | 0.760731 | 0.044646 |
# Write summary stats next to the other artifacts in output_dir instead of
# the current working directory (consistent with scoring_metrics.csv above).
score_stats_df.to_csv(os.path.join(output_dir, 'score_stats.csv'))
# Diagnostic plots aggregated across all CV splits.
MyMultiTrainTester.plot_confusion(normalize=True, figsize=(15,25))
MyMultiTrainTester.plot_confusion(normalize=False, figsize=(15,25))
MyMultiTrainTester.plot_class_freq(normalize=True, figsize=(15,35))
MyMultiTrainTester.plot_precrecall(figsize=(15,35))
This notebook is intended as a generic notebook to be used with the papermill python library to allow automated generation of analyses and reports for classifiers on microbiome data generated by the kraken2 pipeline
cd /project/src
[Errno 2] No such file or directory: '/project/src' /project/6011811/data/microbiome_OJS/workflow
from sklearn import model_selection
from sklearn import metrics
import os
import re
import copy
import numpy as np
import pandas as pd
import sys
sys.path.insert(0, '/project/6011811/data/microbiome_OJS/workflow/src/')
from MicroBiome import MicroBiomeDataSet, Trainer, TrainTester, MultiTrainTester, list_transformer, DiffExpTransform
from ScoreFunctions import *
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn import linear_model as LM
import seaborn as sns
import pickle as pk
from matplotlib import pyplot as plt
# Ignore warning messages
if True:
    import warnings
    warnings.filterwarnings('ignore')
# Default parameters (papermill "parameters" cell); the injected cell below
# overrides these when the notebook is executed via papermill.
input_dir = '/project/data/preprocessed/PE_50K_sex_complete'
output_dir = '/project/results/LR_Classifier_DE_w_clinical_generic'
retrain = True  # retrain from scratch instead of loading cached results
fdr_DE = 0.05   # FDR threshold for differential-expression feature selection
# Parameters
# (cell injected by papermill at execution time)
input_dir = "results/kraken2_PE_5M/notebooks/PE_5M_Sex/prepped_data"
output_dir = "results/kraken2_PE_5M/notebooks/PE_5M_Sex/LR_DE_clinical"
retrain = True
fdr_DE = 0.05
# Quick sanity check of the expected preprocessed input files.
os.listdir(input_dir)
['meta_data_mat.pk', 'metadata_samples_keep.csv', 'y.pk', 'feat_meta.csv', 'X.pk']
# Load preprocessed inputs produced by the data-prep notebook.
# Context managers guarantee the handles close even if unpickling raises.
with open(os.path.join(input_dir, 'X.pk'), 'rb') as infile_X:
    X = pk.load(infile_X)  # microbiome feature matrix (samples x taxa)
with open(os.path.join(input_dir, 'y.pk'), 'rb') as infile_y:
    y = pk.load(infile_y)  # class labels
with open(os.path.join(input_dir, 'meta_data_mat.pk'), 'rb') as infile_meta_data_mat:
    meta_data_mat = pk.load(infile_meta_data_mat)  # clinical covariate matrix
# model input: microbiome features concatenated with clinical metadata columns
X_inp = np.concatenate([X, meta_data_mat], axis=1)
n_splits = 10
out_path = os.path.join(output_dir, 'MyMultiTrainTester.pk')
if retrain:
    # Clear previous results, if any. shutil.rmtree replaces the previous
    # os.system('rm -rf ' + output_dir): no shell involved (no injection via
    # the papermill-supplied path) and portable across platforms.
    import shutil
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)  # also creates missing parent directories
    # We run PCA on microbiome data and leave clinical data intact.
    MyLogistic = LM.LogisticRegression(random_state=42, class_weight='balanced',
                                       penalty='elasticnet', solver='saga', l1_ratio=0.5)
    MatrixPipeLine = Pipeline([('DE0', DiffExpTransform(fdr=fdr_DE)), ('scaler0', StandardScaler()), ('pca', PCA())])
    # NOTE(review): the passthrough selects only column X.shape[1] — a single
    # metadata column. Fine if meta_data_mat has exactly one column (sex);
    # confirm, otherwise use np.arange(X.shape[1], X_inp.shape[1]).
    MyPipeLine = ColumnTransformer([('MatrixPipeLine', MatrixPipeLine, np.arange(0, X.shape[1])),
                                    ('MetaData', 'passthrough', [X.shape[1]])])
    clf = Pipeline([('FullData', MyPipeLine), ('LR', MyLogistic)])
    param_grid = dict(LR__C=np.exp(-np.arange(-10, 10)),
                      FullData__MatrixPipeLine__pca__n_components=np.arange(3, 4))
    model = model_selection.GridSearchCV(clf, param_grid, scoring=metrics.make_scorer(metrics.f1_score), cv=5)
    # Trainer wraps the fitted grid search for the TrainTester machinery.
    MyTrainer = Trainer(model=model)
    # random seed used in class definition is not used in final output models
    MyTrainTester = TrainTester(MyTrainer, metrics.f1_score)
    # note that random seed here affects sequence of seeds passed to making new TrainTester objects
    # using LRTrainTester as template. Thus, you have all settings but seed affecting sample split
    # across all data splits
    MyMultiTrainTester = MultiTrainTester(MyTrainTester, numpy_rand_seed=42, n_splits=n_splits)
    MyMultiTrainTester.train(X_inp, y)
    # Save results; context manager guarantees the handle is closed.
    with open(out_path, 'wb') as outfile:
        pk.dump(MyMultiTrainTester, outfile)
else:
    # Load previous results.
    with open(out_path, 'rb') as infile:
        MyMultiTrainTester = pk.load(infile)
Running for split 1 of 10 Using predict_proba getting predictions from probs Running for split 2 of 10 Using predict_proba getting predictions from probs Running for split 3 of 10 Using predict_proba getting predictions from probs Running for split 4 of 10 Using predict_proba getting predictions from probs Running for split 5 of 10 Using predict_proba getting predictions from probs Running for split 6 of 10 Using predict_proba getting predictions from probs Running for split 7 of 10 Using predict_proba getting predictions from probs Running for split 8 of 10 Using predict_proba getting predictions from probs Running for split 9 of 10 Using predict_proba getting predictions from probs Running for split 10 of 10 Using predict_proba getting predictions from probs
# DataFrame.append is deprecated and was removed in pandas 2.0; build both
# halves and concatenate once. Index 0..n_splits-1 repeats, as before.
scores_df = pd.concat([
    pd.DataFrame({'score': MyMultiTrainTester.train_scores, 'stage': np.repeat('train', n_splits)}),
    pd.DataFrame({'score': MyMultiTrainTester.test_scores, 'stage': np.repeat('test', n_splits)}),
])
scores_df
| score | stage | |
|---|---|---|
| 0 | 0.773585 | train |
| 1 | 0.795031 | train |
| 2 | 0.801187 | train |
| 3 | 0.773006 | train |
| 4 | 0.768750 | train |
| 5 | 0.768750 | train |
| 6 | 0.777778 | train |
| 7 | 0.750000 | train |
| 8 | 0.771605 | train |
| 9 | 0.744337 | train |
| 0 | 0.771084 | test |
| 1 | 0.756098 | test |
| 2 | 0.724638 | test |
| 3 | 0.795455 | test |
| 4 | 0.774194 | test |
| 5 | 0.774194 | test |
| 6 | 0.702703 | test |
| 7 | 0.851064 | test |
| 8 | 0.790698 | test |
| 9 | 0.790698 | test |
sns.boxplot(data = scores_df, x = 'stage', y = 'score')
<AxesSubplot:xlabel='stage', ylabel='score'>
# Collect, for each CV split, the winning hyper-parameters and the boolean
# mask of features retained by the differential-expression step.
feats_in_split = []
hyperparams = {'C': [], 'n_feats': [], 'n_components': []}
for split_idx in range(n_splits):
    search = MyMultiTrainTester.TrainerList[split_idx].model
    best = search.best_params_
    de_step = search.best_estimator_['FullData'].transformers_[0][1]['DE0']
    mask = copy.deepcopy(de_step.selected_feats)
    feats_in_split.append(mask)
    hyperparams['C'].append(best['LR__C'])
    hyperparams['n_feats'].append(np.sum(mask))
    hyperparams['n_components'].append(best['FullData__MatrixPipeLine__pca__n_components'])
hyperparams_df = pd.DataFrame(hyperparams)
hyperparams_df
| C | n_feats | n_components | |
|---|---|---|---|
| 0 | 22026.465795 | 113 | 3 |
| 1 | 22026.465795 | 126 | 3 |
| 2 | 22026.465795 | 103 | 3 |
| 3 | 1.000000 | 97 | 3 |
| 4 | 22026.465795 | 111 | 3 |
| 5 | 22026.465795 | 111 | 3 |
| 6 | 22026.465795 | 126 | 3 |
| 7 | 22026.465795 | 104 | 3 |
| 8 | 22026.465795 | 109 | 3 |
| 9 | 22026.465795 | 128 | 3 |
# Full per-split scoring metrics in long format (score_type / value).
scoring_metrics = MyMultiTrainTester.getScores()
scoring_metrics_df = pd.DataFrame(scoring_metrics)
scoring_metrics_df.to_csv(os.path.join(output_dir, 'scoring_metrics.csv'))
sns.set(rc={"figure.figsize":(12, 8)})
sns.boxplot(data = scoring_metrics_df, x = 'score_type', y = 'value')
<AxesSubplot:xlabel='score_type', ylabel='value'>
# Summarise each score type across splits: mean, median, population std
# (np.std, ddof=0 — matches the original computation).
score_names = np.unique(scoring_metrics['score_type'])
score_stats = {'score': [], 'mean': [], 'median': [], 'std_dev': []}
all_values = scoring_metrics_df['value'].to_numpy()
for score in score_names:
    mask = (scoring_metrics_df['score_type'] == score).to_numpy()
    score_vect = all_values[mask]
    score_stats['score'].append(score)
    score_stats['mean'].append(np.mean(score_vect))
    score_stats['median'].append(np.median(score_vect))
    score_stats['std_dev'].append(np.std(score_vect))
score_stats_df = pd.DataFrame(score_stats)
score_stats_df
| score | mean | median | std_dev | |
|---|---|---|---|---|
| 0 | AUPRC_NEG | 0.736470 | 0.749466 | 0.052032 |
| 1 | AUPRC_POS | 0.859199 | 0.877387 | 0.049260 |
| 2 | AUROC_NEG | 0.816787 | 0.808511 | 0.031085 |
| 3 | AUROC_POS | 0.816787 | 0.808511 | 0.031085 |
| 4 | f1_score | 0.773082 | 0.774194 | 0.038467 |
| 5 | npv_score | 0.693742 | 0.690079 | 0.043237 |
| 6 | ppv_score | 0.794977 | 0.782609 | 0.034955 |
| 7 | sensitivity | 0.753615 | 0.760757 | 0.051597 |
| 8 | specificity | 0.738948 | 0.749346 | 0.050381 |
# Write summary stats next to the other artifacts in output_dir instead of
# the current working directory (consistent with scoring_metrics.csv above).
score_stats_df.to_csv(os.path.join(output_dir, 'score_stats.csv'))
# Diagnostic plots aggregated across all CV splits.
MyMultiTrainTester.plot_confusion(normalize=True, figsize=(15,25))
MyMultiTrainTester.plot_confusion(normalize=False, figsize=(15,25))
MyMultiTrainTester.plot_class_freq(normalize=True, figsize=(15,35))
MyMultiTrainTester.plot_precrecall(figsize=(15,35))
This notebook is intended as a generic notebook to be used with the papermill python library to allow automated generation of analyses and reports for classifiers on microbiome data generated by the kraken2 pipeline
cd /project/src
[Errno 2] No such file or directory: '/project/src' /project/6011811/data/microbiome_OJS/workflow
from sklearn import model_selection
from sklearn import metrics
import os
import re
import copy
import numpy as np
import pandas as pd
import sys
sys.path.insert(0, '/project/6011811/data/microbiome_OJS/workflow/src/')
from MicroBiome import MicroBiomeDataSet, Trainer, TrainTester, MultiTrainTester, list_transformer, DiffExpTransform
from ScoreFunctions import *
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn import linear_model as LM
import seaborn as sns
import pickle as pk
from matplotlib import pyplot as plt
# Ignore warning messages
if True:
    import warnings
    warnings.filterwarnings('ignore')
# Default parameters (papermill "parameters" cell); the injected cell below
# overrides these when the notebook is executed via papermill.
input_dir = '/project/data/preprocessed/PE_50K_sex_complete'
output_dir = '/project/results/LR_Classifier_DE_w_clinical_generic'
retrain = True  # retrain from scratch instead of loading cached results
fdr_DE = 0.05   # FDR threshold for differential-expression feature selection
# Parameters
# (cell injected by papermill at execution time)
input_dir = "results/kraken2_PE_10M/notebooks/PE_10M_Sex/prepped_data"
output_dir = "results/kraken2_PE_10M/notebooks/PE_10M_Sex/LR_DE_clinical"
retrain = True
fdr_DE = 0.05
# Quick sanity check of the expected preprocessed input files.
os.listdir(input_dir)
['meta_data_mat.pk', 'metadata_samples_keep.csv', 'y.pk', 'feat_meta.csv', 'X.pk']
# Load preprocessed inputs produced by the data-prep notebook.
# Context managers guarantee the handles close even if unpickling raises.
with open(os.path.join(input_dir, 'X.pk'), 'rb') as infile_X:
    X = pk.load(infile_X)  # microbiome feature matrix (samples x taxa)
with open(os.path.join(input_dir, 'y.pk'), 'rb') as infile_y:
    y = pk.load(infile_y)  # class labels
with open(os.path.join(input_dir, 'meta_data_mat.pk'), 'rb') as infile_meta_data_mat:
    meta_data_mat = pk.load(infile_meta_data_mat)  # clinical covariate matrix
# model input: microbiome features concatenated with clinical metadata columns
X_inp = np.concatenate([X, meta_data_mat], axis=1)
n_splits = 10
out_path = os.path.join(output_dir, 'MyMultiTrainTester.pk')
if retrain:
    # Clear previous results, if any. shutil.rmtree replaces the previous
    # os.system('rm -rf ' + output_dir): no shell involved (no injection via
    # the papermill-supplied path) and portable across platforms.
    import shutil
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)  # also creates missing parent directories
    # We run PCA on microbiome data and leave clinical data intact.
    MyLogistic = LM.LogisticRegression(random_state=42, class_weight='balanced',
                                       penalty='elasticnet', solver='saga', l1_ratio=0.5)
    MatrixPipeLine = Pipeline([('DE0', DiffExpTransform(fdr=fdr_DE)), ('scaler0', StandardScaler()), ('pca', PCA())])
    # NOTE(review): the passthrough selects only column X.shape[1] — a single
    # metadata column. Fine if meta_data_mat has exactly one column (sex);
    # confirm, otherwise use np.arange(X.shape[1], X_inp.shape[1]).
    MyPipeLine = ColumnTransformer([('MatrixPipeLine', MatrixPipeLine, np.arange(0, X.shape[1])),
                                    ('MetaData', 'passthrough', [X.shape[1]])])
    clf = Pipeline([('FullData', MyPipeLine), ('LR', MyLogistic)])
    param_grid = dict(LR__C=np.exp(-np.arange(-10, 10)),
                      FullData__MatrixPipeLine__pca__n_components=np.arange(3, 4))
    model = model_selection.GridSearchCV(clf, param_grid, scoring=metrics.make_scorer(metrics.f1_score), cv=5)
    # Trainer wraps the fitted grid search for the TrainTester machinery.
    MyTrainer = Trainer(model=model)
    # random seed used in class definition is not used in final output models
    MyTrainTester = TrainTester(MyTrainer, metrics.f1_score)
    # note that random seed here affects sequence of seeds passed to making new TrainTester objects
    # using LRTrainTester as template. Thus, you have all settings but seed affecting sample split
    # across all data splits
    MyMultiTrainTester = MultiTrainTester(MyTrainTester, numpy_rand_seed=42, n_splits=n_splits)
    MyMultiTrainTester.train(X_inp, y)
    # Save results; context manager guarantees the handle is closed.
    with open(out_path, 'wb') as outfile:
        pk.dump(MyMultiTrainTester, outfile)
else:
    # Load previous results.
    with open(out_path, 'rb') as infile:
        MyMultiTrainTester = pk.load(infile)
Running for split 1 of 10 Using predict_proba getting predictions from probs Running for split 2 of 10 Using predict_proba getting predictions from probs Running for split 3 of 10 Using predict_proba getting predictions from probs Running for split 4 of 10 Using predict_proba getting predictions from probs Running for split 5 of 10 Using predict_proba getting predictions from probs Running for split 6 of 10 Using predict_proba getting predictions from probs Running for split 7 of 10 Using predict_proba getting predictions from probs Running for split 8 of 10 Using predict_proba getting predictions from probs Running for split 9 of 10 Using predict_proba getting predictions from probs Running for split 10 of 10 Using predict_proba getting predictions from probs
# DataFrame.append is deprecated and was removed in pandas 2.0; build both
# halves and concatenate once. Index 0..n_splits-1 repeats, as before.
scores_df = pd.concat([
    pd.DataFrame({'score': MyMultiTrainTester.train_scores, 'stage': np.repeat('train', n_splits)}),
    pd.DataFrame({'score': MyMultiTrainTester.test_scores, 'stage': np.repeat('test', n_splits)}),
])
scores_df
| score | stage | |
|---|---|---|
| 0 | 0.766355 | train |
| 1 | 0.795031 | train |
| 2 | 0.787879 | train |
| 3 | 0.775385 | train |
| 4 | 0.776025 | train |
| 5 | 0.776025 | train |
| 6 | 0.775000 | train |
| 7 | 0.760656 | train |
| 8 | 0.773006 | train |
| 9 | 0.746753 | train |
| 0 | 0.785714 | test |
| 1 | 0.756098 | test |
| 2 | 0.735294 | test |
| 3 | 0.808989 | test |
| 4 | 0.747253 | test |
| 5 | 0.747253 | test |
| 6 | 0.702703 | test |
| 7 | 0.860215 | test |
| 8 | 0.761905 | test |
| 9 | 0.790698 | test |
sns.boxplot(data = scores_df, x = 'stage', y = 'score')
<AxesSubplot:xlabel='stage', ylabel='score'>
# Collect, for each CV split, the winning hyper-parameters and the boolean
# mask of features retained by the differential-expression step.
feats_in_split = []
hyperparams = {'C': [], 'n_feats': [], 'n_components': []}
for split_idx in range(n_splits):
    search = MyMultiTrainTester.TrainerList[split_idx].model
    best = search.best_params_
    de_step = search.best_estimator_['FullData'].transformers_[0][1]['DE0']
    mask = copy.deepcopy(de_step.selected_feats)
    feats_in_split.append(mask)
    hyperparams['C'].append(best['LR__C'])
    hyperparams['n_feats'].append(np.sum(mask))
    hyperparams['n_components'].append(best['FullData__MatrixPipeLine__pca__n_components'])
hyperparams_df = pd.DataFrame(hyperparams)
hyperparams_df
| C | n_feats | n_components | |
|---|---|---|---|
| 0 | 22026.465795 | 109 | 3 |
| 1 | 7.389056 | 127 | 3 |
| 2 | 0.367879 | 102 | 3 |
| 3 | 22026.465795 | 94 | 3 |
| 4 | 0.367879 | 110 | 3 |
| 5 | 0.367879 | 110 | 3 |
| 6 | 2.718282 | 123 | 3 |
| 7 | 22026.465795 | 101 | 3 |
| 8 | 22026.465795 | 115 | 3 |
| 9 | 22026.465795 | 128 | 3 |
# Full per-split scoring metrics in long format (score_type / value).
scoring_metrics = MyMultiTrainTester.getScores()
scoring_metrics_df = pd.DataFrame(scoring_metrics)
scoring_metrics_df.to_csv(os.path.join(output_dir, 'scoring_metrics.csv'))
sns.set(rc={"figure.figsize":(12, 8)})
sns.boxplot(data = scoring_metrics_df, x = 'score_type', y = 'value')
<AxesSubplot:xlabel='score_type', ylabel='value'>
# Summarise each score type across splits: mean, median, population std
# (np.std, ddof=0 — matches the original computation).
score_names = np.unique(scoring_metrics['score_type'])
score_stats = {'score': [], 'mean': [], 'median': [], 'std_dev': []}
all_values = scoring_metrics_df['value'].to_numpy()
for score in score_names:
    mask = (scoring_metrics_df['score_type'] == score).to_numpy()
    score_vect = all_values[mask]
    score_stats['score'].append(score)
    score_stats['mean'].append(np.mean(score_vect))
    score_stats['median'].append(np.median(score_vect))
    score_stats['std_dev'].append(np.std(score_vect))
score_stats_df = pd.DataFrame(score_stats)
score_stats_df
| score | mean | median | std_dev | |
|---|---|---|---|---|
| 0 | AUPRC_NEG | 0.736508 | 0.743340 | 0.057949 |
| 1 | AUPRC_POS | 0.858589 | 0.871477 | 0.048014 |
| 2 | AUROC_NEG | 0.815562 | 0.808584 | 0.031208 |
| 3 | AUROC_POS | 0.815562 | 0.808584 | 0.031208 |
| 4 | f1_score | 0.769612 | 0.759001 | 0.041665 |
| 5 | npv_score | 0.687461 | 0.688686 | 0.058611 |
| 6 | ppv_score | 0.797170 | 0.778125 | 0.037569 |
| 7 | sensitivity | 0.745105 | 0.728369 | 0.054217 |
| 8 | specificity | 0.744896 | 0.749346 | 0.055904 |
# Write summary stats next to the other artifacts in output_dir instead of
# the current working directory (consistent with scoring_metrics.csv above).
score_stats_df.to_csv(os.path.join(output_dir, 'score_stats.csv'))
# Diagnostic plots aggregated across all CV splits.
MyMultiTrainTester.plot_confusion(normalize=True, figsize=(15,25))
MyMultiTrainTester.plot_confusion(normalize=False, figsize=(15,25))
MyMultiTrainTester.plot_class_freq(normalize=True, figsize=(15,35))
MyMultiTrainTester.plot_precrecall(figsize=(15,35))